// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@ucw.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/memremap.h>
#include <linux/nmi.h>
#include <linux/gfp.h>
#include <linux/kcore.h>
#include <linux/bootmem_info.h>

#include <asm/processor.h>
#include <asm/bios_ebda.h>
#include <linux/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820/api.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/set_memory.h>
#include <asm/init.h>
#include <asm/uv/uv.h>
#include <asm/setup.h>
#include <asm/ftrace.h>

#include "mm_internal.h"

#include "ident_map.c"

#define DEFINE_POPULATE(fname, type1, type2, init) \
static inline void fname##_init(struct mm_struct *mm, \
		type1##_t *arg1, type2##_t *arg2, bool init) \
{ \
	if (init) \
		fname##_safe(mm, arg1, arg2); \
	else \
		fname(mm, arg1, arg2); \
}

DEFINE_POPULATE(p4d_populate, p4d, pud, init)
DEFINE_POPULATE(pgd_populate, pgd, p4d, init)
DEFINE_POPULATE(pud_populate, pud, pmd, init)
DEFINE_POPULATE(pmd_populate_kernel, pmd, pte, init)
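
/*
 * For reference, each DEFINE_POPULATE() invocation above generates an
 * *_init() wrapper that picks the _safe or plain populate helper at run
 * time. DEFINE_POPULATE(pud_populate, pud, pmd, init), for example,
 * expands to roughly:
 *
 *	static inline void pud_populate_init(struct mm_struct *mm,
 *			pud_t *arg1, pmd_t *arg2, bool init)
 *	{
 *		if (init)
 *			pud_populate_safe(mm, arg1, arg2);
 *		else
 *			pud_populate(mm, arg1, arg2);
 *	}
 */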

#define DEFINE_ENTRY(type1, type2, init) \
static inline void set_##type1##_init(type1##_t *arg1, \
			type2##_t arg2, bool init) \
{ \
	if (init) \
		set_##type1##_safe(arg1, arg2); \
	else \
		set_##type1(arg1, arg2); \
}

DEFINE_ENTRY(p4d, p4d, init)
DEFINE_ENTRY(pud, pud, init)
DEFINE_ENTRY(pmd, pmd, init)
DEFINE_ENTRY(pte, pte, init)
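
/*
 * Likewise, DEFINE_ENTRY(pte, pte, init) above expands to roughly:
 *
 *	static inline void set_pte_init(pte_t *arg1, pte_t arg2, bool init)
 *	{
 *		if (init)
 *			set_pte_safe(arg1, arg2);
 *		else
 *			set_pte(arg1, arg2);
 *	}
 */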

static inline pgprot_t prot_sethuge(pgprot_t prot)
{
	WARN_ON_ONCE(pgprot_val(prot) & _PAGE_PAT);

	return __pgprot(pgprot_val(prot) | _PAGE_PSE);
}

/*
 * NOTE: pagetable_init() allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the location of the first one and move
 * around without checking the pgd every time.
 */

/* Bits supported by the hardware: */
pteval_t __supported_pte_mask __read_mostly = ~0;
/* Bits allowed in normal kernel mappings: */
pteval_t __default_kernel_pte_mask __read_mostly = ~0;
EXPORT_SYMBOL_GPL(__supported_pte_mask);
/* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */
EXPORT_SYMBOL(__default_kernel_pte_mask);

int force_personality32;

/*
 * noexec32=on|off
 * Control the non-executable heap for 32-bit processes.
 *
 * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
 * off	PROT_READ implies PROT_EXEC
 */
static int __init nonx32_setup(char *str)
{
	if (!strcmp(str, "on"))
		force_personality32 &= ~READ_IMPLIES_EXEC;
	else if (!strcmp(str, "off"))
		force_personality32 |= READ_IMPLIES_EXEC;
	return 1;
}
__setup("noexec32=", nonx32_setup);
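
/*
 * For example, booting with "noexec32=off" on the kernel command line sets
 * READ_IMPLIES_EXEC in force_personality32, so PROT_READ mappings of 32-bit
 * tasks become executable as well; "noexec32=on" (the default behaviour)
 * clears it again.
 */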

static void sync_global_pgds_l5(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
		const pgd_t *pgd_ref = pgd_offset_k(addr);
		struct page *page;

		/* Check for overflow */
		if (addr < start)
			break;

		if (pgd_none(*pgd_ref))
			continue;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			spinlock_t *pgt_lock;

			pgd = (pgd_t *)page_address(page) + pgd_index(addr);
			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
			spin_lock(pgt_lock);

			if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

			if (pgd_none(*pgd))
				set_pgd(pgd, *pgd_ref);

			spin_unlock(pgt_lock);
		}
		spin_unlock(&pgd_lock);
	}
}

static void sync_global_pgds_l4(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
		pgd_t *pgd_ref = pgd_offset_k(addr);
		const p4d_t *p4d_ref;
		struct page *page;

		/*
		 * With folded p4d, pgd_none() is always false, so we need to
		 * handle synchronization at the p4d level.
		 */
		MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref));
		p4d_ref = p4d_offset(pgd_ref, addr);

		if (p4d_none(*p4d_ref))
			continue;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			p4d_t *p4d;
			spinlock_t *pgt_lock;

			pgd = (pgd_t *)page_address(page) + pgd_index(addr);
			p4d = p4d_offset(pgd, addr);
			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
			spin_lock(pgt_lock);

			if (!p4d_none(*p4d_ref) && !p4d_none(*p4d))
				BUG_ON(p4d_pgtable(*p4d)
				       != p4d_pgtable(*p4d_ref));

			if (p4d_none(*p4d))
				set_p4d(p4d, *p4d_ref);

			spin_unlock(pgt_lock);
		}
		spin_unlock(&pgd_lock);
	}
}

/*
 * When memory is added, make sure all the processes' MMs have
 * suitable PGD entries in the local PGD-level page.
 */
static void sync_global_pgds(unsigned long start, unsigned long end)
{
	if (pgtable_l5_enabled())
		sync_global_pgds_l5(start, end);
	else
		sync_global_pgds_l4(start, end);
}

/*
 * NOTE: This function is marked __ref because it calls an __init function
 * (memblock_alloc()). It's safe to do that ONLY while after_bootmem == 0.
 */
static __ref void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = memblock_alloc(PAGE_SIZE, PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr)
{
	if (pgd_none(*pgd)) {
		p4d_t *p4d = (p4d_t *)spp_getpage();
		pgd_populate(&init_mm, pgd, p4d);
		if (p4d != p4d_offset(pgd, 0))
			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
			       p4d, p4d_offset(pgd, 0));
	}
	return p4d_offset(pgd, vaddr);
}

static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr)
{
	if (p4d_none(*p4d)) {
		pud_t *pud = (pud_t *)spp_getpage();
		p4d_populate(&init_mm, p4d, pud);
		if (pud != pud_offset(p4d, 0))
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
			       pud, pud_offset(p4d, 0));
	}
	return pud_offset(p4d, vaddr);
}

static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
{
	if (pud_none(*pud)) {
		pmd_t *pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0))
			printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n",
			       pmd, pmd_offset(pud, 0));
	}
	return pmd_offset(pud, vaddr);
}

static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
{
	if (pmd_none(*pmd)) {
		pte_t *pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0))
			printk(KERN_ERR "PAGETABLE BUG #03!\n");
	}
	return pte_offset_kernel(pmd, vaddr);
}

static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte)
{
	pmd_t *pmd = fill_pmd(pud, vaddr);
	pte_t *pte = fill_pte(pmd, vaddr);

	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	flush_tlb_one_kernel(vaddr);
}

void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
{
	p4d_t *p4d = p4d_page + p4d_index(vaddr);
	pud_t *pud = fill_pud(p4d, vaddr);

	__set_pte_vaddr(pud, vaddr, new_pte);
}

void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud = pud_page + pud_index(vaddr);

	__set_pte_vaddr(pud, vaddr, new_pte);
}

void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	p4d_t *p4d_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}

	p4d_page = p4d_offset(pgd, 0);
	set_pte_vaddr_p4d(p4d_page, vaddr, pteval);
}

pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset_k(vaddr);
	p4d = fill_p4d(pgd, vaddr);
	pud = fill_pud(p4d, vaddr);

	return fill_pmd(pud, vaddr);
}

pte_t * __init populate_extra_pte(unsigned long vaddr)
{
	pmd_t *pmd;

	pmd = populate_extra_pmd(vaddr);

	return fill_pte(pmd, vaddr);
}

/*
 * Create large page table mappings for a range of physical addresses.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
					enum page_cache_mode cache)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pgprot_t prot;

	pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) |
		protval_4k_2_large(cachemode2protval(cache));
	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		pgd = pgd_offset_k((unsigned long)__va(phys));
		if (pgd_none(*pgd)) {
			p4d = (p4d_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE |
					   _PAGE_USER));
		}
		p4d = p4d_offset(pgd, (unsigned long)__va(phys));
		if (p4d_none(*p4d)) {
			pud = (pud_t *) spp_getpage();
			set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE |
					   _PAGE_USER));
		}
		pud = pud_offset(p4d, (unsigned long)__va(phys));
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
					   _PAGE_USER));
		}
		pmd = pmd_offset(pud, phys);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}

void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, _PAGE_CACHE_MODE_WB);
}

void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, _PAGE_CACHE_MODE_UC);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_base holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _brk_end.  _brk_end
 * is rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
	unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;

	/*
	 * Native path, max_pfn_mapped is not set yet.
	 * Xen has valid max_pfn_mapped set in
	 *	arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
	 */
	if (max_pfn_mapped)
		vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);

	for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

/*
 * Create PTE level page table mapping for physical addresses.
 * It returns the last physical address mapped.
 */
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
	      pgprot_t prot, bool init)
{
	unsigned long pages = 0, paddr_next;
	unsigned long paddr_last = paddr_end;
	pte_t *pte;
	int i;

	pte = pte_page + pte_index(paddr);
	i = pte_index(paddr);

	for (; i < PTRS_PER_PTE; i++, paddr = paddr_next, pte++) {
		paddr_next = (paddr & PAGE_MASK) + PAGE_SIZE;
		if (paddr >= paddr_end) {
			if (!after_bootmem &&
			    !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
					      E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
					      E820_TYPE_RESERVED_KERN) &&
			    !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
					      E820_TYPE_ACPI))
				set_pte_init(pte, __pte(0), init);
			continue;
		}

		/*
		 * We will re-use the existing mapping.
		 * Xen for example has some special requirements, like mapping
		 * pagetable pages as RO. So assume that whoever pre-set up
		 * these mappings knew what they were doing.
		 */
		if (!pte_none(*pte)) {
			if (!after_bootmem)
				pages++;
			continue;
		}

		if (0)
			pr_info("   pte=%p addr=%lx pte=%016lx\n", pte, paddr,
				pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		pages++;
		set_pte_init(pte, pfn_pte(paddr >> PAGE_SHIFT, prot), init);
		paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE;
	}

	update_page_count(PG_LEVEL_4K, pages);

	return paddr_last;
}

/*
 * Create PMD level page table mapping for physical addresses. The virtual
 * and physical address have to be aligned at this level.
 * It returns the last physical address mapped.
 */
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
	      unsigned long page_size_mask, pgprot_t prot, bool init)
{
	unsigned long pages = 0, paddr_next;
	unsigned long paddr_last = paddr_end;

	int i = pmd_index(paddr);

	for (; i < PTRS_PER_PMD; i++, paddr = paddr_next) {
		pmd_t *pmd = pmd_page + pmd_index(paddr);
		pte_t *pte;
		pgprot_t new_prot = prot;

		paddr_next = (paddr & PMD_MASK) + PMD_SIZE;
		if (paddr >= paddr_end) {
			if (!after_bootmem &&
			    !e820__mapped_any(paddr & PMD_MASK, paddr_next,
					      E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & PMD_MASK, paddr_next,
					      E820_TYPE_RESERVED_KERN) &&
			    !e820__mapped_any(paddr & PMD_MASK, paddr_next,
					      E820_TYPE_ACPI))
				set_pmd_init(pmd, __pmd(0), init);
			continue;
		}

		if (!pmd_none(*pmd)) {
			if (!pmd_leaf(*pmd)) {
				spin_lock(&init_mm.page_table_lock);
				pte = (pte_t *)pmd_page_vaddr(*pmd);
				paddr_last = phys_pte_init(pte, paddr,
							   paddr_end, prot,
							   init);
				spin_unlock(&init_mm.page_table_lock);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_2M mapping, then we will
			 * use the existing mapping,
			 *
			 * Otherwise, we will split the large page mapping but
			 * use the same existing protection bits except for
			 * large page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_2M)) {
				if (!after_bootmem)
					pages++;
				paddr_last = paddr_next;
				continue;
			}
			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
		}

		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pmd_init(pmd,
				     pfn_pmd(paddr >> PAGE_SHIFT, prot_sethuge(prot)),
				     init);
			spin_unlock(&init_mm.page_table_lock);
			paddr_last = paddr_next;
			continue;
		}

		pte = alloc_low_page();
		paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot, init);

		spin_lock(&init_mm.page_table_lock);
		pmd_populate_kernel_init(&init_mm, pmd, pte, init);
		spin_unlock(&init_mm.page_table_lock);
	}
	update_page_count(PG_LEVEL_2M, pages);
	return paddr_last;
}

/*
 * Create PUD level page table mapping for physical addresses. The virtual
 * and physical address do not have to be aligned at this level. KASLR can
 * randomize virtual addresses up to this level.
 * It returns the last physical address mapped.
 */
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
	      unsigned long page_size_mask, pgprot_t _prot, bool init)
{
	unsigned long pages = 0, paddr_next;
	unsigned long paddr_last = paddr_end;
	unsigned long vaddr = (unsigned long)__va(paddr);
	int i = pud_index(vaddr);

	for (; i < PTRS_PER_PUD; i++, paddr = paddr_next) {
		pud_t *pud;
		pmd_t *pmd;
		pgprot_t prot = _prot;

		vaddr = (unsigned long)__va(paddr);
		pud = pud_page + pud_index(vaddr);
		paddr_next = (paddr & PUD_MASK) + PUD_SIZE;

		if (paddr >= paddr_end) {
			if (!after_bootmem &&
			    !e820__mapped_any(paddr & PUD_MASK, paddr_next,
					      E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & PUD_MASK, paddr_next,
					      E820_TYPE_RESERVED_KERN) &&
			    !e820__mapped_any(paddr & PUD_MASK, paddr_next,
					      E820_TYPE_ACPI))
				set_pud_init(pud, __pud(0), init);
			continue;
		}

		if (!pud_none(*pud)) {
			if (!pud_leaf(*pud)) {
				pmd = pmd_offset(pud, 0);
				paddr_last = phys_pmd_init(pmd, paddr,
							   paddr_end,
							   page_size_mask,
							   prot, init);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_1G mapping, then we will
			 * use the existing mapping.
			 *
			 * Otherwise, we will split the gbpage mapping but use
			 * the same existing protection bits except for large
			 * page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_1G)) {
				if (!after_bootmem)
					pages++;
				paddr_last = paddr_next;
				continue;
			}
			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
		}

		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pud_init(pud,
				     pfn_pud(paddr >> PAGE_SHIFT, prot_sethuge(prot)),
				     init);
			spin_unlock(&init_mm.page_table_lock);
			paddr_last = paddr_next;
			continue;
		}

		pmd = alloc_low_page();
		paddr_last = phys_pmd_init(pmd, paddr, paddr_end,
					   page_size_mask, prot, init);

		spin_lock(&init_mm.page_table_lock);
		pud_populate_init(&init_mm, pud, pmd, init);
		spin_unlock(&init_mm.page_table_lock);
	}

	update_page_count(PG_LEVEL_1G, pages);

	return paddr_last;
}

static unsigned long __meminit
phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
	      unsigned long page_size_mask, pgprot_t prot, bool init)
{
	unsigned long vaddr, vaddr_end, vaddr_next, paddr_next, paddr_last;

	paddr_last = paddr_end;
	vaddr = (unsigned long)__va(paddr);
	vaddr_end = (unsigned long)__va(paddr_end);

	if (!pgtable_l5_enabled())
		return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end,
				     page_size_mask, prot, init);

	for (; vaddr < vaddr_end; vaddr = vaddr_next) {
		p4d_t *p4d = p4d_page + p4d_index(vaddr);
		pud_t *pud;

		vaddr_next = (vaddr & P4D_MASK) + P4D_SIZE;
		paddr = __pa(vaddr);

		if (paddr >= paddr_end) {
			paddr_next = __pa(vaddr_next);
			if (!after_bootmem &&
			    !e820__mapped_any(paddr & P4D_MASK, paddr_next,
					      E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & P4D_MASK, paddr_next,
					      E820_TYPE_RESERVED_KERN) &&
			    !e820__mapped_any(paddr & P4D_MASK, paddr_next,
					      E820_TYPE_ACPI))
				set_p4d_init(p4d, __p4d(0), init);
			continue;
		}

		if (!p4d_none(*p4d)) {
			pud = pud_offset(p4d, 0);
			paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end),
						   page_size_mask, prot, init);
			continue;
		}

		pud = alloc_low_page();
		paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end),
					   page_size_mask, prot, init);

		spin_lock(&init_mm.page_table_lock);
		p4d_populate_init(&init_mm, p4d, pud, init);
		spin_unlock(&init_mm.page_table_lock);
	}

	return paddr_last;
}

static unsigned long __meminit
__kernel_physical_mapping_init(unsigned long paddr_start,
			       unsigned long paddr_end,
			       unsigned long page_size_mask,
			       pgprot_t prot, bool init)
{
	bool pgd_changed = false;
	unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;

	paddr_last = paddr_end;
	vaddr = (unsigned long)__va(paddr_start);
	vaddr_end = (unsigned long)__va(paddr_end);
	vaddr_start = vaddr;

	for (; vaddr < vaddr_end; vaddr = vaddr_next) {
		pgd_t *pgd = pgd_offset_k(vaddr);
		p4d_t *p4d;

		vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;

		if (pgd_val(*pgd)) {
			p4d = (p4d_t *)pgd_page_vaddr(*pgd);
			paddr_last = phys_p4d_init(p4d, __pa(vaddr),
						   __pa(vaddr_end),
						   page_size_mask,
						   prot, init);
			continue;
		}

		p4d = alloc_low_page();
		paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
					   page_size_mask, prot, init);

		spin_lock(&init_mm.page_table_lock);
		if (pgtable_l5_enabled())
			pgd_populate_init(&init_mm, pgd, p4d, init);
		else
			p4d_populate_init(&init_mm, p4d_offset(pgd, vaddr),
					  (pud_t *) p4d, init);

		spin_unlock(&init_mm.page_table_lock);
		pgd_changed = true;
	}

	if (pgd_changed)
		sync_global_pgds(vaddr_start, vaddr_end - 1);

	return paddr_last;
}

/*
 * Create page table mapping for the physical memory for specific physical
 * addresses. Note that it can only be used to populate non-present entries.
 * The virtual and physical addresses have to be aligned on PMD level
 * down. It returns the last physical address mapped.
 */
unsigned long __meminit
kernel_physical_mapping_init(unsigned long paddr_start,
			     unsigned long paddr_end,
			     unsigned long page_size_mask, pgprot_t prot)
{
	return __kernel_physical_mapping_init(paddr_start, paddr_end,
					      page_size_mask, prot, true);
}

/*
 * This function is similar to kernel_physical_mapping_init() above, with the
 * exception that it uses set_{pud,pmd}() instead of the set_{pud,pmd}_safe()
 * variants when updating the mapping. The caller is responsible for flushing
 * the TLBs after the function returns.
 */
unsigned long __meminit
kernel_physical_mapping_change(unsigned long paddr_start,
			       unsigned long paddr_end,
			       unsigned long page_size_mask)
{
	return __kernel_physical_mapping_init(paddr_start, paddr_end,
					      page_size_mask, PAGE_KERNEL,
					      false);
}

#ifndef CONFIG_NUMA
void __init initmem_init(void)
{
	memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
}
#endif

void __init paging_init(void)
{
	sparse_init();

	/*
	 * clear the default setting with node 0
	 * note: don't use nodes_clear here, that is really clearing when
	 *	 numa support is not compiled in, and later node_set_state
	 *	 will not set it back.
	 */
	node_clear_state(0, N_MEMORY);
	node_clear_state(0, N_NORMAL_MEMORY);

	zone_sizes_init();
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
#define PAGE_UNUSED 0xFD

/*
 * The unused vmemmap range, which was not yet memset(PAGE_UNUSED), ranges
 * from unused_pmd_start to next PMD_SIZE boundary.
 */
static unsigned long unused_pmd_start __meminitdata;

static void __meminit vmemmap_flush_unused_pmd(void)
{
	if (!unused_pmd_start)
		return;
	/*
	 * Clears (unused_pmd_start, PMD_END]
	 */
	memset((void *)unused_pmd_start, PAGE_UNUSED,
	       ALIGN(unused_pmd_start, PMD_SIZE) - unused_pmd_start);
	unused_pmd_start = 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/* Returns true if the PMD is completely unused and thus it can be freed */
static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
{
	unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);

	/*
	 * Flush the unused range cache to ensure that memchr_inv() will work
	 * for the whole range.
	 */
	vmemmap_flush_unused_pmd();
	memset((void *)addr, PAGE_UNUSED, end - addr);

	return !memchr_inv((void *)start, PAGE_UNUSED, PMD_SIZE);
}
#endif

static void __meminit __vmemmap_use_sub_pmd(unsigned long start)
{
	/*
	 * As we expect to add in the same granularity as we remove, it's
	 * sufficient to mark only some piece used to block the memmap page from
	 * getting removed when removing some other adjacent memmap (just in
	 * case the first memmap never gets initialized e.g., because the memory
	 * block never gets onlined).
	 */
	memset((void *)start, 0, sizeof(struct page));
}

static void __meminit vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
{
	/*
	 * We only optimize if the new used range directly follows the
	 * previously unused range (esp., when populating consecutive sections).
	 */
	if (unused_pmd_start == start) {
		if (likely(IS_ALIGNED(end, PMD_SIZE)))
			unused_pmd_start = 0;
		else
			unused_pmd_start = end;
		return;
	}

	/*
	 * If the range does not contiguously follow the previous one, make
	 * sure to mark the unused range of the previous one so it can be
	 * removed.
	 */
	vmemmap_flush_unused_pmd();
	__vmemmap_use_sub_pmd(start);
}

static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
{
	const unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

	vmemmap_flush_unused_pmd();

	/*
	 * Could be our memmap page is filled with PAGE_UNUSED already from a
	 * previous remove. Make sure to reset it.
	 */
	__vmemmap_use_sub_pmd(start);

	/*
	 * Mark with PAGE_UNUSED the unused parts of the new memmap range
	 */
	if (!IS_ALIGNED(start, PMD_SIZE))
		memset((void *)page, PAGE_UNUSED, start - page);

	/*
	 * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
	 * consecutive sections. Remember for the last added PMD where the
	 * unused range begins.
	 */
	if (!IS_ALIGNED(end, PMD_SIZE))
		unused_pmd_start = end;
}
#endif

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
 * updating.
 */
static void update_end_of_memory_vars(u64 start, u64 size)
{
	unsigned long end_pfn = PFN_UP(start + size);

	if (end_pfn > max_pfn) {
		max_pfn = end_pfn;
		max_low_pfn = end_pfn;
		high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
	}
}

int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
	      struct mhp_params *params)
{
	unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1;
	int ret;

	if (WARN_ON_ONCE(end > PHYSMEM_END))
		return -ERANGE;

	ret = __add_pages(nid, start_pfn, nr_pages, params);
	WARN_ON_ONCE(ret);

	/* update max_pfn, max_low_pfn and high_memory */
	update_end_of_memory_vars(start_pfn << PAGE_SHIFT,
				  nr_pages << PAGE_SHIFT);

	return ret;
}

int arch_add_memory(int nid, u64 start, u64 size,
		    struct mhp_params *params)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;

	init_memory_mapping(start, start + size, params->pgprot);

	return add_pages(nid, start_pfn, nr_pages, params);
}

static void __meminit free_pagetable(struct page *page, int order)
{
	unsigned long magic;
	unsigned int nr_pages = 1 << order;

	/* bootmem page has reserved flag */
	if (PageReserved(page)) {
		magic = page->index;
		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
			while (nr_pages--)
				put_page_bootmem(page++);
		} else
			while (nr_pages--)
				free_reserved_page(page++);
	} else
		free_pages((unsigned long)page_address(page), order);
}

static void __meminit free_hugepage_table(struct page *page,
		struct vmem_altmap *altmap)
{
	if (altmap)
		vmem_altmap_free(altmap, PMD_SIZE / PAGE_SIZE);
	else
		free_pagetable(page, get_order(PMD_SIZE));
}

static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	pte_t *pte;
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = pte_start + i;
		if (!pte_none(*pte))
			return;
	}

	/* free a pte table */
	free_pagetable(pmd_page(*pmd), 0);
	spin_lock(&init_mm.page_table_lock);
	pmd_clear(pmd);
	spin_unlock(&init_mm.page_table_lock);
}

static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	pmd_t *pmd;
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd = pmd_start + i;
		if (!pmd_none(*pmd))
			return;
	}

	/* free a pmd table */
	free_pagetable(pud_page(*pud), 0);
	spin_lock(&init_mm.page_table_lock);
	pud_clear(pud);
	spin_unlock(&init_mm.page_table_lock);
}

static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
{
	pud_t *pud;
	int i;

	for (i = 0; i < PTRS_PER_PUD; i++) {
		pud = pud_start + i;
		if (!pud_none(*pud))
			return;
	}

	/* free a pud table */
	free_pagetable(p4d_page(*p4d), 0);
	spin_lock(&init_mm.page_table_lock);
	p4d_clear(p4d);
	spin_unlock(&init_mm.page_table_lock);
}

static void __meminit
remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
		 bool direct)
{
	unsigned long next, pages = 0;
	pte_t *pte;
	phys_addr_t phys_addr;

	pte = pte_start + pte_index(addr);
	for (; addr < end; addr = next, pte++) {
		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
			next = end;

		if (!pte_present(*pte))
			continue;

		/*
		 * We mapped [0,1G) memory as identity mapping when
		 * initializing, in arch/x86/kernel/head_64.S. These
		 * pagetables cannot be removed.
		 */
		phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
		if (phys_addr < (phys_addr_t)0x40000000)
			return;

		if (!direct)
			free_pagetable(pte_page(*pte), 0);

		spin_lock(&init_mm.page_table_lock);
		pte_clear(&init_mm, addr, pte);
		spin_unlock(&init_mm.page_table_lock);

		/* For non-direct mapping, pages means nothing. */
		pages++;
	}

	/* Call free_pte_table() in remove_pmd_table(). */
	flush_tlb_all();
	if (direct)
		update_page_count(PG_LEVEL_4K, -pages);
}

static void __meminit
remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
		 bool direct, struct vmem_altmap *altmap)
{
	unsigned long next, pages = 0;
	pte_t *pte_base;
	pmd_t *pmd;

	pmd = pmd_start + pmd_index(addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);

		if (!pmd_present(*pmd))
			continue;

		if (pmd_leaf(*pmd)) {
			if (IS_ALIGNED(addr, PMD_SIZE) &&
			    IS_ALIGNED(next, PMD_SIZE)) {
				if (!direct)
					free_hugepage_table(pmd_page(*pmd),
							    altmap);

				spin_lock(&init_mm.page_table_lock);
				pmd_clear(pmd);
				spin_unlock(&init_mm.page_table_lock);
				pages++;
			}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
			else if (vmemmap_pmd_is_unused(addr, next)) {
				free_hugepage_table(pmd_page(*pmd),
						    altmap);
				spin_lock(&init_mm.page_table_lock);
				pmd_clear(pmd);
				spin_unlock(&init_mm.page_table_lock);
			}
#endif
			continue;
		}

		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
		remove_pte_table(pte_base, addr, next, direct);
		free_pte_table(pte_base, pmd);
	}

	/* Call free_pmd_table() in remove_pud_table(). */
	if (direct)
		update_page_count(PG_LEVEL_2M, -pages);
}

static void __meminit
remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
		 struct vmem_altmap *altmap, bool direct)
{
	unsigned long next, pages = 0;
	pmd_t *pmd_base;
	pud_t *pud;

	pud = pud_start + pud_index(addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);

		if (!pud_present(*pud))
			continue;

		if (pud_leaf(*pud) &&
		    IS_ALIGNED(addr, PUD_SIZE) &&
		    IS_ALIGNED(next, PUD_SIZE)) {
			spin_lock(&init_mm.page_table_lock);
			pud_clear(pud);
			spin_unlock(&init_mm.page_table_lock);
			pages++;
			continue;
		}

		pmd_base = pmd_offset(pud, 0);
		remove_pmd_table(pmd_base, addr, next, direct, altmap);
		free_pmd_table(pmd_base, pud);
	}

	if (direct)
		update_page_count(PG_LEVEL_1G, -pages);
}

static void __meminit
remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
		 struct vmem_altmap *altmap, bool direct)
{
	unsigned long next, pages = 0;
	pud_t *pud_base;
	p4d_t *p4d;

	p4d = p4d_start + p4d_index(addr);
	for (; addr < end; addr = next, p4d++) {
		next = p4d_addr_end(addr, end);

		if (!p4d_present(*p4d))
			continue;

		BUILD_BUG_ON(p4d_leaf(*p4d));

		pud_base = pud_offset(p4d, 0);
		remove_pud_table(pud_base, addr, next, altmap, direct);
		/*
		 * For 4-level page tables we do not want to free PUDs, but in the
		 * 5-level case we should free them. This code will have to change
		 * to adapt for boot-time switching between 4 and 5 level page tables.
		 */
		if (pgtable_l5_enabled())
			free_pud_table(pud_base, p4d);
	}

	if (direct)
		update_page_count(PG_LEVEL_512G, -pages);
}

/* start and end are both virtual addresses. */
static void __meminit
remove_pagetable(unsigned long start, unsigned long end, bool direct,
		struct vmem_altmap *altmap)
{
	unsigned long next;
	unsigned long addr;
	pgd_t *pgd;
	p4d_t *p4d;

	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);

		pgd = pgd_offset_k(addr);
		if (!pgd_present(*pgd))
			continue;

		p4d = p4d_offset(pgd, 0);
		remove_p4d_table(p4d, addr, next, altmap, direct);
	}

	flush_tlb_all();
}

void __ref vmemmap_free(unsigned long start, unsigned long end,
		struct vmem_altmap *altmap)
{
	VM_BUG_ON(!PAGE_ALIGNED(start));
	VM_BUG_ON(!PAGE_ALIGNED(end));

	remove_pagetable(start, end, false, altmap);
}

static void __meminit
kernel_physical_mapping_remove(unsigned long start, unsigned long end)
{
	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	remove_pagetable(start, end, true, NULL);
}

void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;

	__remove_pages(start_pfn, nr_pages, altmap);
	kernel_physical_mapping_remove(start, start + size);
}
#endif /* CONFIG_MEMORY_HOTPLUG */

static struct kcore_list kcore_vsyscall;

static void __init register_page_bootmem_info(void)
{
#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP)
	int i;

	for_each_online_node(i)
		register_page_bootmem_info_node(NODE_DATA(i));
#endif
}

/*
 * Pre-allocates page-table pages for the vmalloc area in the kernel page-table.
 * Only the level which needs to be synchronized between all page-tables is
 * allocated because the synchronization can be expensive.
 */
static void __init preallocate_vmalloc_pages(void)
{
	unsigned long addr;
	const char *lvl;

	for (addr = VMALLOC_START; addr <= VMEMORY_END; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
		pgd_t *pgd = pgd_offset_k(addr);
		p4d_t *p4d;
		pud_t *pud;

		lvl = "p4d";
		p4d = p4d_alloc(&init_mm, pgd, addr);
		if (!p4d)
			goto failed;

		if (pgtable_l5_enabled())
			continue;

		/*
		 * The goal here is to allocate all possibly required
		 * hardware page tables pointed to by the top hardware
		 * level.
		 *
		 * On 4-level systems, the P4D layer is folded away and
		 * the above code does no preallocation.  Below, go down
		 * to the pud _software_ level to ensure the second
		 * hardware level is allocated on 4-level systems too.
		 */
		lvl = "pud";
		pud = pud_alloc(&init_mm, p4d, addr);
		if (!pud)
			goto failed;
	}

	return;

failed:

	/*
	 * The pages have to be there now or they will be missing in
	 * process page-tables later.
	 */
	panic("Failed to pre-allocate %s pages for vmalloc area\n", lvl);
}

void __init mem_init(void)
{
	pci_iommu_alloc();

	/* clear_bss() already cleared empty_zero_page */

	/* this will put all memory onto the freelists */
	memblock_free_all();
	after_bootmem = 1;
	x86_init.hyper.init_after_bootmem();

	/*
	 * Must be done after boot memory is put on freelist, because here we
	 * might set fields in deferred struct pages that have not yet been
	 * initialized, and memblock_free_all() initializes all the reserved
	 * deferred pages for us.
	 */
	register_page_bootmem_info();

	/* Register memory areas for /proc/kcore */
	if (get_gate_vma(&init_mm))
		kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER);

	preallocate_vmalloc_pages();
}

int kernel_set_to_readonly;

void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
	unsigned long end = (unsigned long)__end_rodata_hpage_align;
	unsigned long text_end = PFN_ALIGN(_etext);
	unsigned long rodata_end = PFN_ALIGN(__end_rodata);
	unsigned long all_end;

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	kernel_set_to_readonly = 1;

	/*
	 * The rodata/data/bss/brk section (but not the kernel text!)
	 * should also be not-executable.
	 *
	 * We align all_end to PMD_SIZE because the existing mapping
	 * is a full PMD. If we would align _brk_end to PAGE_SIZE we
	 * split the PMD and the remainder between _brk_end and the end
	 * of the PMD will remain mapped executable.
	 *
	 * Any PMD which was setup after the one which covers _brk_end
	 * has been zapped already via cleanup_highmap().
	 */
	all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
	set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT);

	set_ftrace_ops_ro();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif

	free_kernel_image_pages("unused kernel image (text/rodata gap)",
				(void *)text_end, (void *)rodata_start);
	free_kernel_image_pages("unused kernel image (rodata/data gap)",
				(void *)rodata_end, (void *)_sdata);
}

/*
 * Block size is the minimum amount of memory which can be hotplugged or
 * hotremoved. It must be a power of two and must be equal to or larger than
 * MIN_MEMORY_BLOCK_SIZE.
 */
#define MAX_BLOCK_SIZE (2UL << 30)

/* Amount of ram needed to start using large blocks */
#define MEM_SIZE_FOR_LARGE_BLOCK (64UL << 30)

/* Adjustable memory block size */
static unsigned long set_memory_block_size;
int __init set_memory_block_size_order(unsigned int order)
{
	unsigned long size = 1UL << order;

	if (size > MEM_SIZE_FOR_LARGE_BLOCK || size < MIN_MEMORY_BLOCK_SIZE)
		return -EINVAL;

	set_memory_block_size = size;
	return 0;
}
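
/*
 * For example, set_memory_block_size_order(31) selects 1UL << 31 = 2 GiB
 * blocks (i.e. MAX_BLOCK_SIZE above); orders that yield a size below
 * MIN_MEMORY_BLOCK_SIZE or above MEM_SIZE_FOR_LARGE_BLOCK (64 GiB) are
 * rejected with -EINVAL.
 */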

static unsigned long probe_memory_block_size(void)
{
	unsigned long boot_mem_end = max_pfn << PAGE_SHIFT;
	unsigned long bz;

	/* If memory block size has been set, then use it */
	bz = set_memory_block_size;
	if (bz)
		goto done;

	/* Use regular block if RAM is smaller than MEM_SIZE_FOR_LARGE_BLOCK */
	if (boot_mem_end < MEM_SIZE_FOR_LARGE_BLOCK) {
		bz = MIN_MEMORY_BLOCK_SIZE;
		goto done;
	}

	/*
	 * Use max block size to minimize overhead on bare metal, where
	 * alignment for memory hotplug isn't a concern.
	 */
	if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
		bz = MAX_BLOCK_SIZE;
		goto done;
	}

	/* Find the largest allowed block size that aligns to memory end */
	for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) {
		if (IS_ALIGNED(boot_mem_end, bz))
			break;
	}
done:
	pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20);

	return bz;
}

static unsigned long memory_block_size_probed;

unsigned long memory_block_size_bytes(void)
{
	if (!memory_block_size_probed)
		memory_block_size_probed = probe_memory_block_size();

	return memory_block_size_probed;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
			       unsigned long addr, unsigned long next)
{
	pte_t entry;

	entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
			PAGE_KERNEL_LARGE);
	set_pmd(pmd, __pmd(pte_val(entry)));

	/* check to see if we have contiguous blocks */
	if (p_end != p || node_start != node) {
		if (p_start)
			pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
				addr_start, addr_end-1, p_start, p_end-1, node_start);
		addr_start = addr;
		node_start = node;
		p_start = p;
	}

	addr_end = addr + PMD_SIZE;
	p_end = p + PMD_SIZE;

	if (!IS_ALIGNED(addr, PMD_SIZE) ||
	    !IS_ALIGNED(next, PMD_SIZE))
		vmemmap_use_new_sub_pmd(addr, next);
}

int __meminit vmemmap_check_pmd(pmd_t *pmd, int node,
				unsigned long addr, unsigned long next)
{
	int large = pmd_leaf(*pmd);

	if (pmd_leaf(*pmd)) {
		vmemmap_verify((pte_t *)pmd, node, addr, next);
		vmemmap_use_sub_pmd(addr, next);
	}

	return large;
}

int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
			       struct vmem_altmap *altmap)
{
	int err;

	VM_BUG_ON(!PAGE_ALIGNED(start));
	VM_BUG_ON(!PAGE_ALIGNED(end));

	if (end - start < PAGES_PER_SECTION * sizeof(struct page))
		err = vmemmap_populate_basepages(start, end, node, NULL);
	else if (boot_cpu_has(X86_FEATURE_PSE))
		err = vmemmap_populate_hugepages(start, end, node, altmap);
	else if (altmap) {
		pr_err_once("%s: no cpu support for altmap allocations\n",
				__func__);
		err = -ENOMEM;
	} else
		err = vmemmap_populate_basepages(start, end, node, NULL);
	if (!err)
		sync_global_pgds(start, end - 1);
	return err;
}

#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
void register_page_bootmem_memmap(unsigned long section_nr,
				  struct page *start_page, unsigned long nr_pages)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + nr_pages);
	unsigned long next;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	unsigned int nr_pmd_pages;
	struct page *page;

	for (; addr < end; addr = next) {
		pte_t *pte = NULL;

		pgd = pgd_offset_k(addr);
		if (pgd_none(*pgd)) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			continue;
		}
		get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);

		p4d = p4d_offset(pgd, addr);
		if (p4d_none(*p4d)) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			continue;
		}
		get_page_bootmem(section_nr, p4d_page(*p4d), MIX_SECTION_INFO);

		pud = pud_offset(p4d, addr);
		if (pud_none(*pud)) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			continue;
		}
		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);

		if (!boot_cpu_has(X86_FEATURE_PSE)) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd))
				continue;
			get_page_bootmem(section_nr, pmd_page(*pmd),
					 MIX_SECTION_INFO);

			pte = pte_offset_kernel(pmd, addr);
			if (pte_none(*pte))
				continue;
			get_page_bootmem(section_nr, pte_page(*pte),
					 SECTION_INFO);
		} else {
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd))
				continue;

			nr_pmd_pages = 1 << get_order(PMD_SIZE);
			page = pmd_page(*pmd);
			while (nr_pmd_pages--)
				get_page_bootmem(section_nr, page++,
						 SECTION_INFO);
		}
	}
}
#endif

void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
#endif