arch_numa.c 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * NUMA support, based on the x86 implementation.
  4. *
  5. * Copyright (C) 2015 Cavium Inc.
  6. * Author: Ganapatrao Kulkarni <gkulkarni@cavium.com>
  7. */
  8. #define pr_fmt(fmt) "NUMA: " fmt
  9. #include <linux/acpi.h>
  10. #include <linux/memblock.h>
  11. #include <linux/module.h>
  12. #include <linux/of.h>
  13. #include <linux/numa_memblks.h>
  14. #include <asm/sections.h>
  15. static int cpu_to_node_map[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
  16. bool numa_off;
  17. static __init int numa_parse_early_param(char *opt)
  18. {
  19. if (!opt)
  20. return -EINVAL;
  21. if (str_has_prefix(opt, "off"))
  22. numa_off = true;
  23. if (!strncmp(opt, "fake=", 5))
  24. return numa_emu_cmdline(opt + 5);
  25. return 0;
  26. }
  27. early_param("numa", numa_parse_early_param);
  28. cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
  29. EXPORT_SYMBOL(node_to_cpumask_map);
  #ifdef CONFIG_DEBUG_PER_CPU_MAPS
  /*
   * Checked lookup of the CPU mask that belongs to 'node'.
   *
   * NUMA_NO_NODE yields cpu_all_mask. A node id outside [0, nr_node_ids)
   * yields cpu_none_mask, and a node whose map entry is still NULL yields
   * cpu_online_mask; both of those cases also raise a WARN_ON().
   */
  const struct cpumask *cpumask_of_node(int node)
  {
  	const struct cpumask *mask;

  	if (node == NUMA_NO_NODE)
  		return cpu_all_mask;

  	if (WARN_ON(node < 0 || node >= nr_node_ids))
  		return cpu_none_mask;

  	mask = node_to_cpumask_map[node];
  	if (WARN_ON(mask == NULL))
  		return cpu_online_mask;

  	return mask;
  }
  EXPORT_SYMBOL(cpumask_of_node);
  #endif /* CONFIG_DEBUG_PER_CPU_MAPS */
  46. #ifndef CONFIG_NUMA_EMU
  47. static void numa_update_cpu(unsigned int cpu, bool remove)
  48. {
  49. int nid = cpu_to_node(cpu);
  50. if (nid == NUMA_NO_NODE)
  51. return;
  52. if (remove)
  53. cpumask_clear_cpu(cpu, node_to_cpumask_map[nid]);
  54. else
  55. cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
  56. }
  57. void numa_add_cpu(unsigned int cpu)
  58. {
  59. numa_update_cpu(cpu, false);
  60. }
  61. void numa_remove_cpu(unsigned int cpu)
  62. {
  63. numa_update_cpu(cpu, true);
  64. }
  65. #endif
  66. void numa_clear_node(unsigned int cpu)
  67. {
  68. numa_remove_cpu(cpu);
  69. set_cpu_numa_node(cpu, NUMA_NO_NODE);
  70. }
  71. /*
  72. * Allocate node_to_cpumask_map based on number of available nodes
  73. * Requires node_possible_map to be valid.
  74. *
  75. * Note: cpumask_of_node() is not valid until after this is done.
  76. * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
  77. */
  78. static void __init setup_node_to_cpumask_map(void)
  79. {
  80. int node;
  81. /* setup nr_node_ids if not done yet */
  82. if (nr_node_ids == MAX_NUMNODES)
  83. setup_nr_node_ids();
  84. /* allocate and clear the mapping */
  85. for (node = 0; node < nr_node_ids; node++) {
  86. alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
  87. cpumask_clear(node_to_cpumask_map[node]);
  88. }
  89. /* cpumask_of_node() will now work */
  90. pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
  91. }
  92. /*
  93. * Set the cpu to node and mem mapping
  94. */
  95. void numa_store_cpu_info(unsigned int cpu)
  96. {
  97. set_cpu_numa_node(cpu, cpu_to_node_map[cpu]);
  98. }
  99. void __init early_map_cpu_to_node(unsigned int cpu, int nid)
  100. {
  101. /* fallback to node 0 */
  102. if (nid < 0 || nid >= MAX_NUMNODES || numa_off)
  103. nid = 0;
  104. cpu_to_node_map[cpu] = nid;
  105. /*
  106. * We should set the numa node of cpu0 as soon as possible, because it
  107. * has already been set up online before. cpu_to_node(0) will soon be
  108. * called.
  109. */
  110. if (!cpu)
  111. set_cpu_numa_node(cpu, nid);
  112. }
  113. #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
  114. unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
  115. EXPORT_SYMBOL(__per_cpu_offset);
  116. int early_cpu_to_node(int cpu)
  117. {
  118. return cpu_to_node_map[cpu];
  119. }
  120. static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
  121. {
  122. return node_distance(early_cpu_to_node(from), early_cpu_to_node(to));
  123. }
  124. void __init setup_per_cpu_areas(void)
  125. {
  126. unsigned long delta;
  127. unsigned int cpu;
  128. int rc = -EINVAL;
  129. if (pcpu_chosen_fc != PCPU_FC_PAGE) {
  130. /*
  131. * Always reserve area for module percpu variables. That's
  132. * what the legacy allocator did.
  133. */
  134. rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
  135. PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
  136. pcpu_cpu_distance,
  137. early_cpu_to_node);
  138. #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
  139. if (rc < 0)
  140. pr_warn("PERCPU: %s allocator failed (%d), falling back to page size\n",
  141. pcpu_fc_names[pcpu_chosen_fc], rc);
  142. #endif
  143. }
  144. #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
  145. if (rc < 0)
  146. rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, early_cpu_to_node);
  147. #endif
  148. if (rc < 0)
  149. panic("Failed to initialize percpu areas (err=%d).", rc);
  150. delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
  151. for_each_possible_cpu(cpu)
  152. __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
  153. }
  154. #endif
  155. /*
  156. * Initialize NODE_DATA for a node on the local memory
  157. */
  158. static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
  159. {
  160. if (start_pfn >= end_pfn)
  161. pr_info("Initmem setup node %d [<memory-less node>]\n", nid);
  162. alloc_node_data(nid);
  163. NODE_DATA(nid)->node_id = nid;
  164. NODE_DATA(nid)->node_start_pfn = start_pfn;
  165. NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
  166. }
  167. static int __init numa_register_nodes(void)
  168. {
  169. int nid;
  170. /* Check the validity of the memblock/node mapping */
  171. if (!memblock_validate_numa_coverage(0))
  172. return -EINVAL;
  173. /* Finally register nodes. */
  174. for_each_node_mask(nid, numa_nodes_parsed) {
  175. unsigned long start_pfn, end_pfn;
  176. get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
  177. setup_node_data(nid, start_pfn, end_pfn);
  178. node_set_online(nid);
  179. }
  180. /* Setup online nodes to actual nodes*/
  181. node_possible_map = numa_nodes_parsed;
  182. return 0;
  183. }
  184. static int __init numa_init(int (*init_func)(void))
  185. {
  186. int ret;
  187. nodes_clear(numa_nodes_parsed);
  188. nodes_clear(node_possible_map);
  189. nodes_clear(node_online_map);
  190. ret = numa_memblks_init(init_func, /* memblock_force_top_down */ false);
  191. if (ret < 0)
  192. goto out_free_distance;
  193. if (nodes_empty(numa_nodes_parsed)) {
  194. pr_info("No NUMA configuration found\n");
  195. ret = -EINVAL;
  196. goto out_free_distance;
  197. }
  198. ret = numa_register_nodes();
  199. if (ret < 0)
  200. goto out_free_distance;
  201. setup_node_to_cpumask_map();
  202. return 0;
  203. out_free_distance:
  204. numa_reset_distance();
  205. return ret;
  206. }
  207. /**
  208. * dummy_numa_init() - Fallback dummy NUMA init
  209. *
  210. * Used if there's no underlying NUMA architecture, NUMA initialization
  211. * fails, or NUMA is disabled on the command line.
  212. *
  213. * Must online at least one node (node 0) and add memory blocks that cover all
  214. * allowed memory. It is unlikely that this function fails.
  215. *
  216. * Return: 0 on success, -errno on failure.
  217. */
  218. static int __init dummy_numa_init(void)
  219. {
  220. phys_addr_t start = memblock_start_of_DRAM();
  221. phys_addr_t end = memblock_end_of_DRAM() - 1;
  222. int ret;
  223. if (numa_off)
  224. pr_info("NUMA disabled\n"); /* Forced off on command line. */
  225. pr_info("Faking a node at [mem %pap-%pap]\n", &start, &end);
  226. ret = numa_add_memblk(0, start, end + 1);
  227. if (ret) {
  228. pr_err("NUMA init failed\n");
  229. return ret;
  230. }
  231. node_set(0, numa_nodes_parsed);
  232. numa_off = true;
  233. return 0;
  234. }
  235. #ifdef CONFIG_ACPI_NUMA
  236. static int __init arch_acpi_numa_init(void)
  237. {
  238. int ret;
  239. ret = acpi_numa_init();
  240. if (ret) {
  241. pr_debug("Failed to initialise from firmware\n");
  242. return ret;
  243. }
  244. return srat_disabled() ? -EINVAL : 0;
  245. }
  246. #else
  247. static int __init arch_acpi_numa_init(void)
  248. {
  249. return -EOPNOTSUPP;
  250. }
  251. #endif
  252. /**
  253. * arch_numa_init() - Initialize NUMA
  254. *
  255. * Try each configured NUMA initialization method until one succeeds. The
  256. * last fallback is dummy single node config encompassing whole memory.
  257. */
  258. void __init arch_numa_init(void)
  259. {
  260. if (!numa_off) {
  261. if (!acpi_disabled && !numa_init(arch_acpi_numa_init))
  262. return;
  263. if (acpi_disabled && !numa_init(of_numa_init))
  264. return;
  265. }
  266. numa_init(dummy_numa_init);
  267. }
  268. #ifdef CONFIG_NUMA_EMU
  269. void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
  270. unsigned int nr_emu_nids)
  271. {
  272. int i, j;
  273. /*
  274. * Transform cpu_to_node_map table to use emulated nids by
  275. * reverse-mapping phys_nid. The maps should always exist but fall
  276. * back to zero just in case.
  277. */
  278. for (i = 0; i < ARRAY_SIZE(cpu_to_node_map); i++) {
  279. if (cpu_to_node_map[i] == NUMA_NO_NODE)
  280. continue;
  281. for (j = 0; j < nr_emu_nids; j++)
  282. if (cpu_to_node_map[i] == emu_nid_to_phys[j])
  283. break;
  284. cpu_to_node_map[i] = j < nr_emu_nids ? j : 0;
  285. }
  286. }
  287. u64 __init numa_emu_dma_end(void)
  288. {
  289. return memblock_start_of_DRAM() + SZ_4G;
  290. }
  291. void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable)
  292. {
  293. struct cpumask *mask;
  294. if (node == NUMA_NO_NODE)
  295. return;
  296. mask = node_to_cpumask_map[node];
  297. if (!cpumask_available(mask)) {
  298. pr_err("node_to_cpumask_map[%i] NULL\n", node);
  299. dump_stack();
  300. return;
  301. }
  302. if (enable)
  303. cpumask_set_cpu(cpu, mask);
  304. else
  305. cpumask_clear_cpu(cpu, mask);
  306. pr_debug("%s cpu %d node %d: mask now %*pbl\n",
  307. enable ? "numa_add_cpu" : "numa_remove_cpu",
  308. cpu, node, cpumask_pr_args(mask));
  309. }
  310. #endif /* CONFIG_NUMA_EMU */