group_cpus.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (C) 2016 Thomas Gleixner.
  4. * Copyright (C) 2016-2017 Christoph Hellwig.
  5. */
  6. #include <linux/kernel.h>
  7. #include <linux/slab.h>
  8. #include <linux/cpu.h>
  9. #include <linux/sort.h>
  10. #include <linux/group_cpus.h>
  11. #ifdef CONFIG_SMP
  12. static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
  13. unsigned int cpus_per_grp)
  14. {
  15. const struct cpumask *siblmsk;
  16. int cpu, sibl;
  17. for ( ; cpus_per_grp > 0; ) {
  18. cpu = cpumask_first(nmsk);
  19. /* Should not happen, but I'm too lazy to think about it */
  20. if (cpu >= nr_cpu_ids)
  21. return;
  22. cpumask_clear_cpu(cpu, nmsk);
  23. cpumask_set_cpu(cpu, irqmsk);
  24. cpus_per_grp--;
  25. /* If the cpu has siblings, use them first */
  26. siblmsk = topology_sibling_cpumask(cpu);
  27. for (sibl = -1; cpus_per_grp > 0; ) {
  28. sibl = cpumask_next(sibl, siblmsk);
  29. if (sibl >= nr_cpu_ids)
  30. break;
  31. if (!cpumask_test_and_clear_cpu(sibl, nmsk))
  32. continue;
  33. cpumask_set_cpu(sibl, irqmsk);
  34. cpus_per_grp--;
  35. }
  36. }
  37. }
  38. static cpumask_var_t *alloc_node_to_cpumask(void)
  39. {
  40. cpumask_var_t *masks;
  41. int node;
  42. masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
  43. if (!masks)
  44. return NULL;
  45. for (node = 0; node < nr_node_ids; node++) {
  46. if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
  47. goto out_unwind;
  48. }
  49. return masks;
  50. out_unwind:
  51. while (--node >= 0)
  52. free_cpumask_var(masks[node]);
  53. kfree(masks);
  54. return NULL;
  55. }
  56. static void free_node_to_cpumask(cpumask_var_t *masks)
  57. {
  58. int node;
  59. for (node = 0; node < nr_node_ids; node++)
  60. free_cpumask_var(masks[node]);
  61. kfree(masks);
  62. }
/* Populate @masks so that masks[n] contains every possible CPU on node n. */
static void build_node_to_cpumask(cpumask_var_t *masks)
{
	int cpu;

	for_each_possible_cpu(cpu)
		cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
}
  69. static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
  70. const struct cpumask *mask, nodemask_t *nodemsk)
  71. {
  72. int n, nodes = 0;
  73. /* Calculate the number of nodes in the supplied affinity mask */
  74. for_each_node(n) {
  75. if (cpumask_intersects(mask, node_to_cpumask[n])) {
  76. node_set(n, *nodemsk);
  77. nodes++;
  78. }
  79. }
  80. return nodes;
  81. }
  82. struct node_groups {
  83. unsigned id;
  84. union {
  85. unsigned ngroups;
  86. unsigned ncpus;
  87. };
  88. };
  89. static int ncpus_cmp_func(const void *l, const void *r)
  90. {
  91. const struct node_groups *ln = l;
  92. const struct node_groups *rn = r;
  93. return ln->ncpus - rn->ncpus;
  94. }
/*
 * Allocate group number for each node, so that for each node:
 *
 * 1) the allocated number is >= 1
 *
 * 2) the allocated number is <= active CPU number of this node
 *
 * The actual allocated total groups may be less than @numgrps when
 * active total CPU number is less than @numgrps.
 *
 * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
 * for each node.
 *
 * Results are written into @node_groups (indexed by sort position, with
 * ->id giving the node); @nmsk is caller-provided scratch space.
 */
static void alloc_nodes_groups(unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       const nodemask_t nodemsk,
			       struct cpumask *nmsk,
			       struct node_groups *node_groups)
{
	unsigned n, remaining_ncpus = 0;

	/* UINT_MAX marks nodes without active CPUs; they sort last and
	 * are skipped by the allocation loop below. */
	for (n = 0; n < nr_node_ids; n++) {
		node_groups[n].id = n;
		node_groups[n].ncpus = UINT_MAX;
	}

	for_each_node_mask(n, nodemsk) {
		unsigned ncpus;

		/* Active CPUs of this node = node CPUs AND @cpu_mask */
		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
		ncpus = cpumask_weight(nmsk);

		if (!ncpus)
			continue;
		remaining_ncpus += ncpus;
		node_groups[n].ncpus = ncpus;
	}

	/* Never hand out more groups than there are active CPUs */
	numgrps = min_t(unsigned, remaining_ncpus, numgrps);

	sort(node_groups, nr_node_ids, sizeof(node_groups[0]),
	     ncpus_cmp_func, NULL);

	/*
	 * Allocate groups for each node according to the ratio of this
	 * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is
	 * bigger than number of active numa nodes. Always start the
	 * allocation from the node with minimized nr_cpus.
	 *
	 * This way guarantees that each active node gets allocated at
	 * least one group, and the theory is simple: over-allocation
	 * is only done when this node is assigned by one group, so
	 * other nodes will be allocated >= 1 groups, since 'numgrps' is
	 * bigger than number of numa nodes.
	 *
	 * One perfect invariant is that number of allocated groups for
	 * each node is <= CPU count of this node:
	 *
	 * 1) suppose there are two nodes: A and B
	 *	ncpu(X) is CPU count of node X
	 *	grps(X) is the group count allocated to node X via this
	 *	algorithm
	 *
	 *	ncpu(A) <= ncpu(B)
	 *	ncpu(A) + ncpu(B) = N
	 *	grps(A) + grps(B) = G
	 *
	 *	grps(A) = max(1, round_down(G * ncpu(A) / N))
	 *	grps(B) = G - grps(A)
	 *
	 *	both N and G are integer, and 2 <= G <= N, suppose
	 *	G = N - delta, and 0 <= delta <= N - 2
	 *
	 * 2) obviously grps(A) <= ncpu(A) because:
	 *
	 *	if grps(A) is 1, then grps(A) <= ncpu(A) given
	 *	ncpu(A) >= 1
	 *
	 *	otherwise,
	 *		grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N
	 *
	 * 3) prove how grps(B) <= ncpu(B):
	 *
	 *	if round_down(G * ncpu(A) / N) == 0, vecs(B) won't be
	 *	over-allocated, so grps(B) <= ncpu(B),
	 *
	 *	otherwise:
	 *
	 *	grps(A) =
	 *		round_down(G * ncpu(A) / N) =
	 *		round_down((N - delta) * ncpu(A) / N) =
	 *		round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
	 *		round_down((N * ncpu(A) - delta * N) / N) =
	 *		cpu(A) - delta
	 *
	 *	then:
	 *
	 *	grps(A) - G >= ncpu(A) - delta - G
	 *	=>
	 *	G - grps(A) <= G + delta - ncpu(A)
	 *	=>
	 *	grps(B) <= N - ncpu(A)
	 *	=>
	 *	grps(B) <= cpu(B)
	 *
	 * For nodes >= 3, it can be thought as one node and another big
	 * node given that is exactly what this algorithm is implemented,
	 * and we always re-calculate 'remaining_ncpus' & 'numgrps', and
	 * finally for each node X: grps(X) <= ncpu(X).
	 *
	 */
	for (n = 0; n < nr_node_ids; n++) {
		unsigned ngroups, ncpus;

		if (node_groups[n].ncpus == UINT_MAX)
			continue;

		WARN_ON_ONCE(numgrps == 0);

		ncpus = node_groups[n].ncpus;
		ngroups = max_t(unsigned, 1,
				 numgrps * ncpus / remaining_ncpus);
		WARN_ON_ONCE(ngroups > ncpus);

		node_groups[n].ngroups = ngroups;
		remaining_ncpus -= ncpus;
		numgrps -= ngroups;
	}
}
/*
 * Spread @numgrps groups over the CPUs in @cpu_mask, NUMA-node aware.
 * Filling starts at masks[@startgrp] and wraps around at @numgrps so a
 * second pass can continue where a previous one stopped.  @nmsk is
 * caller-provided scratch space.
 *
 * Returns the number of groups that received CPUs (which may be less
 * than @numgrps when @cpu_mask has fewer nodes/CPUs), 0 for an empty
 * @cpu_mask, or -ENOMEM.
 */
static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       struct cpumask *nmsk, struct cpumask *masks)
{
	unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0;
	unsigned int last_grp = numgrps;
	unsigned int curgrp = startgrp;
	nodemask_t nodemsk = NODE_MASK_NONE;
	struct node_groups *node_groups;

	/* Nothing to spread */
	if (cpumask_empty(cpu_mask))
		return 0;

	nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);

	/*
	 * If the number of nodes in the mask is greater than or equal the
	 * number of groups we just spread the groups across the nodes.
	 */
	if (numgrps <= nodes) {
		for_each_node_mask(n, nodemsk) {
			/* Ensure that only CPUs which are in both masks are set */
			cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
			cpumask_or(&masks[curgrp], &masks[curgrp], nmsk);
			if (++curgrp == last_grp)
				curgrp = 0;
		}
		return numgrps;
	}

	node_groups = kcalloc(nr_node_ids,
			      sizeof(struct node_groups),
			      GFP_KERNEL);
	if (!node_groups)
		return -ENOMEM;

	/* allocate group number for each node */
	alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
			   nodemsk, nmsk, node_groups);

	for (i = 0; i < nr_node_ids; i++) {
		unsigned int ncpus, v;
		struct node_groups *nv = &node_groups[i];

		/* UINT_MAX sentinel: node had no active CPUs */
		if (nv->ngroups == UINT_MAX)
			continue;

		/* Get the cpus on this node which are in the mask */
		cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
		ncpus = cpumask_weight(nmsk);
		if (!ncpus)
			continue;

		WARN_ON_ONCE(nv->ngroups > ncpus);

		/* Account for rounding errors */
		extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);

		/* Spread allocated groups on CPUs of the current node */
		for (v = 0; v < nv->ngroups; v++, curgrp++) {
			cpus_per_grp = ncpus / nv->ngroups;

			/* Account for extra groups to compensate rounding errors */
			if (extra_grps) {
				cpus_per_grp++;
				--extra_grps;
			}

			/*
			 * wrapping has to be considered given 'startgrp'
			 * may start anywhere
			 */
			if (curgrp >= last_grp)
				curgrp = 0;
			grp_spread_init_one(&masks[curgrp], nmsk,
					    cpus_per_grp);
		}
		done += nv->ngroups;
	}
	kfree(node_groups);
	return done;
}
/**
 * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
 * @numgrps: number of groups
 *
 * Return: cpumask array if successful, NULL otherwise. And each element
 * includes CPUs assigned to this group
 *
 * Try to put close CPUs from viewpoint of CPU and NUMA locality into
 * same group, and run two-stage grouping:
 *	1) allocate present CPUs on these groups evenly first
 *	2) allocate other possible CPUs on these groups evenly
 *
 * We guarantee in the resulted grouping that all CPUs are covered, and
 * no same CPU is assigned to multiple groups
 */
struct cpumask *group_cpus_evenly(unsigned int numgrps)
{
	unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
	cpumask_var_t *node_to_cpumask;
	cpumask_var_t nmsk, npresmsk;
	int ret = -ENOMEM;
	struct cpumask *masks = NULL;

	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
		return NULL;

	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
		goto fail_nmsk;

	node_to_cpumask = alloc_node_to_cpumask();
	if (!node_to_cpumask)
		goto fail_npresmsk;

	masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
	if (!masks)
		goto fail_node_to_cpumask;

	build_node_to_cpumask(node_to_cpumask);

	/*
	 * Make a local cache of 'cpu_present_mask', so the two stages
	 * spread can observe consistent 'cpu_present_mask' without holding
	 * cpu hotplug lock, then we can reduce deadlock risk with cpu
	 * hotplug code.
	 *
	 * Here CPU hotplug may happen when reading `cpu_present_mask`, and
	 * we can live with the case because it only affects that hotplug
	 * CPU is handled in the 1st or 2nd stage, and either way is correct
	 * from API user viewpoint since 2-stage spread is sort of
	 * optimization.
	 */
	cpumask_copy(npresmsk, data_race(cpu_present_mask));

	/* grouping present CPUs first */
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  npresmsk, nmsk, masks);
	if (ret < 0)
		goto fail_build_affinity;
	nr_present = ret;

	/*
	 * Allocate non present CPUs starting from the next group to be
	 * handled. If the grouping of present CPUs already exhausted the
	 * group space, assign the non present CPUs to the already
	 * allocated out groups.
	 */
	if (nr_present >= numgrps)
		curgrp = 0;
	else
		curgrp = nr_present;
	/* flip npresmsk to the non-present subset of cpu_possible_mask */
	cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk);
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  npresmsk, nmsk, masks);
	if (ret >= 0)
		nr_others = ret;

 fail_build_affinity:
	if (ret >= 0)
		WARN_ON(nr_present + nr_others < numgrps);

 fail_node_to_cpumask:
	free_node_to_cpumask(node_to_cpumask);

 fail_npresmsk:
	free_cpumask_var(npresmsk);

 fail_nmsk:
	free_cpumask_var(nmsk);
	/* on any failure the caller gets NULL; partially filled masks are freed */
	if (ret < 0) {
		kfree(masks);
		return NULL;
	}
	return masks;
}
  366. #else /* CONFIG_SMP */
  367. struct cpumask *group_cpus_evenly(unsigned int numgrps)
  368. {
  369. struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
  370. if (!masks)
  371. return NULL;
  372. /* assign all CPUs(cpu 0) to the 1st group only */
  373. cpumask_copy(&masks[0], cpu_possible_mask);
  374. return masks;
  375. }
  376. #endif /* CONFIG_SMP */
  377. EXPORT_SYMBOL_GPL(group_cpus_evenly);