numa_emulation.c

// SPDX-License-Identifier: GPL-2.0
/*
 * NUMA emulation
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/topology.h>
#include <linux/memblock.h>
#include <linux/numa_memblks.h>
#include <asm/numa.h>

#define FAKE_NODE_MIN_SIZE	((u64)32 << 20)
#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))

static int emu_nid_to_phys[MAX_NUMNODES];
static char *emu_cmdline __initdata;
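
/*
 * Record the "numa=fake" option string; it is parsed later by
 * numa_emulation() once the physical memory layout is known.
 */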
int __init numa_emu_cmdline(char *str)
{
	emu_cmdline = str;
	return 0;
}
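
/*
 * Return the index of the first memblk in @mi that belongs to @nid,
 * or -ENOENT if @nid owns no memblk.
 */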
static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].nid == nid)
			return i;
	return -ENOENT;
}
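
/*
 * Return the number of bytes of absent (hole) memory between @start and
 * @end, counted in whole pages.
 */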
static u64 __init mem_hole_size(u64 start, u64 end)
{
	unsigned long start_pfn = PFN_UP(start);
	unsigned long end_pfn = PFN_DOWN(end);

	if (start_pfn < end_pfn)
		return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
	return 0;
}

/*
 * Carve @size bytes off the start of physical memblk @phys_blk in @pi and
 * assign the carved range to emulated node @nid in @ei.  The return value
 * is -errno if something went wrong, 0 otherwise.
 */
static int __init emu_setup_memblk(struct numa_meminfo *ei,
				   struct numa_meminfo *pi,
				   int nid, int phys_blk, u64 size)
{
	struct numa_memblk *eb = &ei->blk[ei->nr_blks];
	struct numa_memblk *pb = &pi->blk[phys_blk];

	if (ei->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("NUMA: Too many emulated memblks, failing emulation\n");
		return -EINVAL;
	}

	ei->nr_blks++;
	eb->start = pb->start;
	eb->end = pb->start + size;
	eb->nid = nid;

	if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
		emu_nid_to_phys[nid] = pb->nid;

	pb->start += size;
	if (pb->start >= pb->end) {
		WARN_ON_ONCE(pb->start > pb->end);
		numa_remove_memblk_from(phys_blk, pi);
	}

	printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
	       nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
	return 0;
}

/*
 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
 * to max_addr.
 *
 * Returns zero on success or negative on error.
 */
static int __init split_nodes_interleave(struct numa_meminfo *ei,
					 struct numa_meminfo *pi,
					 u64 addr, u64 max_addr, int nr_nodes)
{
	nodemask_t physnode_mask = numa_nodes_parsed;
	u64 size;
	int big;
	int nid = 0;
	int i, ret;

	if (nr_nodes <= 0)
		return -1;
	if (nr_nodes > MAX_NUMNODES) {
		pr_info("numa=fake=%d too large, reducing to %d\n",
			nr_nodes, MAX_NUMNODES);
		nr_nodes = MAX_NUMNODES;
	}

	/*
	 * Calculate target node size.  x86_32 freaks on __udivdi3() so do
	 * the division in ulong number of pages and convert back.
	 */
	size = max_addr - addr - mem_hole_size(addr, max_addr);
	size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);

	/*
	 * Calculate the number of big nodes that can be allocated as a result
	 * of consolidating the remainder.
	 */
	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
		FAKE_NODE_MIN_SIZE;

	size &= FAKE_NODE_MIN_HASH_MASK;
	if (!size) {
		pr_err("Not enough memory for each node.  "
		       "NUMA emulation disabled.\n");
		return -1;
	}

	/*
	 * Continue to fill physical nodes with fake nodes until there is no
	 * memory left on any of them.
	 */
	while (!nodes_empty(physnode_mask)) {
		for_each_node_mask(i, physnode_mask) {
			u64 dma32_end = numa_emu_dma_end();
			u64 start, limit, end;
			int phys_blk;

			phys_blk = emu_find_memblk_by_nid(i, pi);
			if (phys_blk < 0) {
				node_clear(i, physnode_mask);
				continue;
			}
			start = pi->blk[phys_blk].start;
			limit = pi->blk[phys_blk].end;
			end = start + size;

			if (nid < big)
				end += FAKE_NODE_MIN_SIZE;

			/*
			 * Continue to add memory to this fake node if its
			 * non-reserved memory is less than the per-node size.
			 */
			while (end - start - mem_hole_size(start, end) < size) {
				end += FAKE_NODE_MIN_SIZE;
				if (end > limit) {
					end = limit;
					break;
				}
			}

			/*
			 * If there won't be at least FAKE_NODE_MIN_SIZE of
			 * non-reserved memory in ZONE_DMA32 for the next node,
			 * this one must extend to the boundary.
			 */
			if (end < dma32_end && dma32_end - end -
			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
				end = dma32_end;

			/*
			 * If there won't be enough non-reserved memory for the
			 * next node, this one must extend to the end of the
			 * physical node.
			 */
			if (limit - end - mem_hole_size(end, limit) < size)
				end = limit;

			ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
					       phys_blk,
					       min(end, limit) - start);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}

/*
 * Returns the end address of a node so that there is at least `size' amount of
 * non-reserved memory or `max_addr' is reached.
 */
static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
{
	u64 end = start + size;

	while (end - start - mem_hole_size(start, end) < size) {
		end += FAKE_NODE_MIN_SIZE;
		if (end > max_addr) {
			end = max_addr;
			break;
		}
	}
	return end;
}
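
/*
 * Uniform per-node size for @nr_nodes nodes spanning [@base, @max_addr),
 * less @hole bytes of holes.  The division is done in pages so it stays
 * within unsigned long arithmetic.
 */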
static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes)
{
	unsigned long max_pfn = PHYS_PFN(max_addr);
	unsigned long base_pfn = PHYS_PFN(base);
	unsigned long hole_pfns = PHYS_PFN(hole);

	return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes);
}

/*
 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
 * `addr' to `max_addr'.
 *
 * Returns the next free emulated nid on success or negative on error.
 */
static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
						      struct numa_meminfo *pi,
						      u64 addr, u64 max_addr, u64 size,
						      int nr_nodes, struct numa_memblk *pblk,
						      int nid)
{
	nodemask_t physnode_mask = numa_nodes_parsed;
	int i, ret, uniform = 0;
	u64 min_size;

	if ((!size && !nr_nodes) || (nr_nodes && !pblk))
		return -1;

	/*
	 * In the 'uniform' case split the passed in physical node by
	 * nr_nodes, in the non-uniform case, ignore the passed in
	 * physical block and try to create nodes of at least size
	 * @size.
	 *
	 * In the uniform case, split the nodes strictly by physical
	 * capacity, i.e. ignore holes.  In the non-uniform case account
	 * for holes and treat @size as a minimum floor.
	 */
	if (!nr_nodes)
		nr_nodes = MAX_NUMNODES;
	else {
		nodes_clear(physnode_mask);
		node_set(pblk->nid, physnode_mask);
		uniform = 1;
	}

	if (uniform) {
		min_size = uniform_size(max_addr, addr, 0, nr_nodes);
		size = min_size;
	} else {
		/*
		 * The limit on emulated nodes is MAX_NUMNODES, so the
		 * size per node is increased accordingly if the
		 * requested size is too small.  This creates a uniform
		 * distribution of node sizes across the entire machine
		 * (but not necessarily over physical nodes).
		 */
		min_size = uniform_size(max_addr, addr,
				mem_hole_size(addr, max_addr), nr_nodes);
	}
	min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
	if (size < min_size) {
		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
		       size >> 20, min_size >> 20);
		size = min_size;
	}
	size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);

	/*
	 * Fill physical nodes with fake nodes of size until there is no memory
	 * left on any of them.
	 */
	while (!nodes_empty(physnode_mask)) {
		for_each_node_mask(i, physnode_mask) {
			u64 dma32_end = numa_emu_dma_end();
			u64 start, limit, end;
			int phys_blk;

			phys_blk = emu_find_memblk_by_nid(i, pi);
			if (phys_blk < 0) {
				node_clear(i, physnode_mask);
				continue;
			}

			start = pi->blk[phys_blk].start;
			limit = pi->blk[phys_blk].end;

			if (uniform)
				end = start + size;
			else
				end = find_end_of_node(start, limit, size);
			/*
			 * If there won't be at least FAKE_NODE_MIN_SIZE of
			 * non-reserved memory in ZONE_DMA32 for the next node,
			 * this one must extend to the boundary.
			 */
			if (end < dma32_end && dma32_end - end -
			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
				end = dma32_end;

			/*
			 * If there won't be enough non-reserved memory for the
			 * next node, this one must extend to the end of the
			 * physical node.
			 */
			if ((limit - end - mem_hole_size(end, limit) < size) &&
			    !uniform)
				end = limit;

			ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
					       phys_blk,
					       min(end, limit) - start);
			if (ret < 0)
				return ret;
		}
	}
	return nid;
}
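
/*
 * Non-uniform wrapper: interleave fake nodes of at least @size bytes over
 * all parsed physical nodes.
 */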
static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
					      struct numa_meminfo *pi,
					      u64 addr, u64 max_addr, u64 size)
{
	return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
			0, NULL, 0);
}
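
/*
 * Scan emu_nid_to_phys[] for the highest emulated nid actually mapped, and
 * pick the first mapped physical nid as the default (*dfl_phys_nid) for
 * emulated nodes left unmapped.  Returns the max emulated nid.
 */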
static int __init setup_emu2phys_nid(int *dfl_phys_nid)
{
	int i, max_emu_nid = 0;

	*dfl_phys_nid = NUMA_NO_NODE;
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
		if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
			max_emu_nid = i;
			if (*dfl_phys_nid == NUMA_NO_NODE)
				*dfl_phys_nid = emu_nid_to_phys[i];
		}
	}

	return max_emu_nid;
}
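
/*
 * Example "numa=fake" usages, inferred from the parsing in numa_emulation()
 * below (a sketch, not authoritative documentation):
 *
 *   numa=fake=8     - interleave eight fake nodes over the physical nodes
 *   numa=fake=512M  - fake nodes of 512MB each
 *   numa=fake=4U    - split each physical node into four uniform fake nodes
 *
 * A trailing ':' followed by comma-separated values, consumed via
 * get_option() below, appears to override the emulated distance table.
 */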

/**
 * numa_emulation - Emulate NUMA nodes
 * @numa_meminfo: NUMA configuration to massage
 * @numa_dist_cnt: The size of the physical NUMA distance table
 *
 * Emulate NUMA nodes according to the numa=fake kernel parameter.
 * @numa_meminfo contains the physical memory configuration and is modified
 * to reflect the emulated configuration on success.  @numa_dist_cnt is
 * used to determine the size of the physical distance table.
 *
 * On success, the following modifications are made.
 *
 * - @numa_meminfo is updated to reflect the emulated nodes.
 *
 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
 *   emulated nodes.
 *
 * - NUMA distance table is rebuilt to represent distances between emulated
 *   nodes.  The distances are determined considering how emulated nodes
 *   are mapped to physical nodes and match the actual distances.
 *
 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
 *   nodes.  This is used by numa_add_cpu() and numa_remove_cpu().
 *
 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
 * identity mapping and no other modification is made.
 */
void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
{
	static struct numa_meminfo ei __initdata;
	static struct numa_meminfo pi __initdata;
	const u64 max_addr = PFN_PHYS(max_pfn);
	u8 *phys_dist = NULL;
	size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
	int max_emu_nid, dfl_phys_nid;
	int i, j, ret;

	if (!emu_cmdline)
		goto no_emu;

	memset(&ei, 0, sizeof(ei));
	pi = *numa_meminfo;

	for (i = 0; i < MAX_NUMNODES; i++)
		emu_nid_to_phys[i] = NUMA_NO_NODE;

	/*
	 * If the numa=fake command-line contains a 'M' or 'G', it represents
	 * the fixed node size.  If it contains a 'U' (numa=fake=<N>U), each
	 * physical node is split into N uniform emulated nodes.  Otherwise,
	 * if it is just a single number N, split the system RAM into N fake
	 * nodes.
	 */
	if (strchr(emu_cmdline, 'U')) {
		nodemask_t physnode_mask = numa_nodes_parsed;
		unsigned long n;
		int nid = 0;

		n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
		ret = -1;
		for_each_node_mask(i, physnode_mask) {
			/*
			 * The reason we pass in blk[0] is due to
			 * numa_remove_memblk_from() called by
			 * emu_setup_memblk() will delete entry 0
			 * and then move everything else up in the pi.blk
			 * array.  Therefore we should always be looking
			 * at blk[0].
			 */
			ret = split_nodes_size_interleave_uniform(&ei, &pi,
					pi.blk[0].start, pi.blk[0].end, 0,
					n, &pi.blk[0], nid);
			if (ret < 0)
				break;
			if (ret < n) {
				pr_info("%s: phys: %d only got %d of %ld nodes, failing\n",
						__func__, i, ret, n);
				ret = -1;
				break;
			}
			nid = ret;
		}
	} else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
		u64 size;

		size = memparse(emu_cmdline, &emu_cmdline);
		ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
	} else {
		unsigned long n;

		n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
		ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
	}
	if (*emu_cmdline == ':')
		emu_cmdline++;

	if (ret < 0)
		goto no_emu;

	if (numa_cleanup_meminfo(&ei) < 0) {
		pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
		goto no_emu;
	}

	/* copy the physical distance table */
	if (numa_dist_cnt) {
		phys_dist = memblock_alloc(phys_size, PAGE_SIZE);
		if (!phys_dist) {
			pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
			goto no_emu;
		}

		for (i = 0; i < numa_dist_cnt; i++)
			for (j = 0; j < numa_dist_cnt; j++)
				phys_dist[i * numa_dist_cnt + j] =
					node_distance(i, j);
	}

	/*
	 * Determine the max emulated nid and the default phys nid to use
	 * for unmapped nodes.
	 */
	max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);

	/* commit */
	*numa_meminfo = ei;

	/* Make sure numa_nodes_parsed only contains emulated nodes */
	nodes_clear(numa_nodes_parsed);
	for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
		if (ei.blk[i].start != ei.blk[i].end &&
		    ei.blk[i].nid != NUMA_NO_NODE)
			node_set(ei.blk[i].nid, numa_nodes_parsed);

	numa_emu_update_cpu_to_node(emu_nid_to_phys, ARRAY_SIZE(emu_nid_to_phys));

	/* make sure all emulated nodes are mapped to a physical node */
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
			emu_nid_to_phys[i] = dfl_phys_nid;

	/* transform distance table */
	numa_reset_distance();
	for (i = 0; i < max_emu_nid + 1; i++) {
		for (j = 0; j < max_emu_nid + 1; j++) {
			int physi = emu_nid_to_phys[i];
			int physj = emu_nid_to_phys[j];
			int dist;

			if (get_option(&emu_cmdline, &dist) == 2)
				;
			else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
				dist = physi == physj ?
					LOCAL_DISTANCE : REMOTE_DISTANCE;
			else
				dist = phys_dist[physi * numa_dist_cnt + physj];

			numa_set_distance(i, j, dist);
		}
	}

	/* free the copied physical distance table */
	memblock_free(phys_dist, phys_size);
	return;

no_emu:
	/* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
		emu_nid_to_phys[i] = i;
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS
void numa_add_cpu(unsigned int cpu)
{
	int physnid, nid;

	nid = early_cpu_to_node(cpu);
	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));

	physnid = emu_nid_to_phys[nid];

	/*
	 * Map the cpu to each emulated node that is allocated on the physical
	 * node of the cpu's apic id.
	 */
	for_each_online_node(nid)
		if (emu_nid_to_phys[nid] == physnid)
			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
}

void numa_remove_cpu(unsigned int cpu)
{
	int i;

	for_each_online_node(i)
		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
}
#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
static void numa_set_cpumask(unsigned int cpu, bool enable)
{
	int nid, physnid;

	nid = early_cpu_to_node(cpu);
	if (nid == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}

	physnid = emu_nid_to_phys[nid];

	for_each_online_node(nid) {
		if (emu_nid_to_phys[nid] != physnid)
			continue;

		debug_cpumask_set_cpu(cpu, nid, enable);
	}
}

void numa_add_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, true);
}

void numa_remove_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, false);
}
#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */