mmu.c 42 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Based on arch/arm/mm/mmu.c
  4. *
  5. * Copyright (C) 1995-2005 Russell King
  6. * Copyright (C) 2012 ARM Ltd.
  7. */
  8. #include <linux/cache.h>
  9. #include <linux/export.h>
  10. #include <linux/kernel.h>
  11. #include <linux/errno.h>
  12. #include <linux/init.h>
  13. #include <linux/ioport.h>
  14. #include <linux/kexec.h>
  15. #include <linux/libfdt.h>
  16. #include <linux/mman.h>
  17. #include <linux/nodemask.h>
  18. #include <linux/memblock.h>
  19. #include <linux/memremap.h>
  20. #include <linux/memory.h>
  21. #include <linux/fs.h>
  22. #include <linux/io.h>
  23. #include <linux/mm.h>
  24. #include <linux/vmalloc.h>
  25. #include <linux/set_memory.h>
  26. #include <linux/kfence.h>
  27. #include <linux/pkeys.h>
  28. #include <asm/barrier.h>
  29. #include <asm/cputype.h>
  30. #include <asm/fixmap.h>
  31. #include <asm/kasan.h>
  32. #include <asm/kernel-pgtable.h>
  33. #include <asm/sections.h>
  34. #include <asm/setup.h>
  35. #include <linux/sizes.h>
  36. #include <asm/tlb.h>
  37. #include <asm/mmu_context.h>
  38. #include <asm/ptdump.h>
  39. #include <asm/tlbflush.h>
  40. #include <asm/pgalloc.h>
  41. #include <asm/kfence.h>
/* Flags restricting how __create_pgd_mapping() may build the tables. */
#define NO_BLOCK_MAPPINGS	BIT(0)
#define NO_CONT_MAPPINGS	BIT(1)
#define NO_EXEC_MAPPINGS	BIT(2)	/* assumes FEAT_HPDS is not used */

/* Offset between the kernel image's virtual and physical addresses. */
u64 kimage_voffset __ro_after_init;
EXPORT_SYMBOL(kimage_voffset);

u32 __boot_cpu_mode[] = { BOOT_CPU_MODE_EL2, BOOT_CPU_MODE_EL1 };

/* Cleared by mark_rodata_ro(); set_swapper_pgd() keys off this flag. */
static bool rodata_is_rw __ro_after_init = true;

/*
 * The booting CPU updates the failed status @__early_cpu_boot_status,
 * with MMU turned off.
 */
long __section(".mmuoff.data.write") __early_cpu_boot_status;

/*
 * Empty_zero_page is a special page that is used for zero-initialized data
 * and COW.
 */
unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
EXPORT_SYMBOL(empty_zero_page);

/* Serialises updates of swapper_pg_dir done through the fixmap alias. */
static DEFINE_SPINLOCK(swapper_pgdir_lock);
/* Serialises users of the page-table fixmap slots in __create_pgd_mapping(). */
static DEFINE_MUTEX(fixmap_lock);
/*
 * Write one entry of swapper_pg_dir.
 *
 * After mark_rodata_ro() the swapper PGD page is no longer writable via
 * the kernel mapping, so the update must go through a short-lived fixmap
 * alias of the page, serialised by swapper_pgdir_lock.
 */
void noinstr set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
{
	pgd_t *fixmap_pgdp;

	/*
	 * Don't bother with the fixmap if swapper_pg_dir is still mapped
	 * writable in the kernel mapping.
	 */
	if (rodata_is_rw) {
		WRITE_ONCE(*pgdp, pgd);
		dsb(ishst);
		isb();
		return;
	}

	spin_lock(&swapper_pgdir_lock);
	fixmap_pgdp = pgd_set_fixmap(__pa_symbol(pgdp));
	WRITE_ONCE(*fixmap_pgdp, pgd);
	/*
	 * We need dsb(ishst) here to ensure the page-table-walker sees
	 * our new entry before set_p?d() returns. The fixmap's
	 * flush_tlb_kernel_range() via clear_fixmap() does this for us.
	 */
	pgd_clear_fixmap();
	spin_unlock(&swapper_pgdir_lock);
}
  86. pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
  87. unsigned long size, pgprot_t vma_prot)
  88. {
  89. if (!pfn_is_map_memory(pfn))
  90. return pgprot_noncached(vma_prot);
  91. else if (file->f_flags & O_SYNC)
  92. return pgprot_writecombine(vma_prot);
  93. return vma_prot;
  94. }
  95. EXPORT_SYMBOL(phys_mem_access_prot);
  96. static phys_addr_t __init early_pgtable_alloc(int shift)
  97. {
  98. phys_addr_t phys;
  99. phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
  100. MEMBLOCK_ALLOC_NOLEAKTRACE);
  101. if (!phys)
  102. panic("Failed to allocate page table page\n");
  103. return phys;
  104. }
/*
 * Decide whether a live kernel mapping may be changed from @old to @new
 * without a break-before-make sequence. Returns true when only attributes
 * known to be safely updatable in place differ between the two values.
 */
bool pgattr_change_is_safe(u64 old, u64 new)
{
	/*
	 * The following mapping attributes may be updated in live
	 * kernel mappings without the need for break-before-make.
	 */
	pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG |
			PTE_SWBITS_MASK;

	/* creating or taking down mappings is always safe */
	if (!pte_valid(__pte(old)) || !pte_valid(__pte(new)))
		return true;

	/* A live entry's pfn should not change */
	if (pte_pfn(__pte(old)) != pte_pfn(__pte(new)))
		return false;

	/* live contiguous mappings may not be manipulated at all */
	if ((old | new) & PTE_CONT)
		return false;

	/* Transitioning from Non-Global to Global is unsafe */
	if (old & ~new & PTE_NG)
		return false;

	/*
	 * Changing the memory type between Normal and Normal-Tagged is safe
	 * since Tagged is considered a permission attribute from the
	 * mismatched attribute aliases perspective.
	 */
	if (((old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) ||
	     (old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)) &&
	    ((new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) ||
	     (new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)))
		mask |= PTE_ATTRINDX_MASK;

	/* No bits outside the updatable mask may differ. */
	return ((old ^ new) & ~mask) == 0;
}
/* Zero a freshly allocated page-table page before it is linked in. */
static void init_clear_pgtable(void *table)
{
	clear_page(table);

	/* Ensure the zeroing is observed by page table walks. */
	dsb(ishst);
}
/*
 * Fill PTE entries for [addr, end) mapping @phys with @prot, page by page.
 * A populated entry may only be rewritten if pgattr_change_is_safe()
 * allows it (i.e. permission-attribute updates only).
 */
static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
		     phys_addr_t phys, pgprot_t prot)
{
	do {
		pte_t old_pte = __ptep_get(ptep);

		/*
		 * Required barriers to make this visible to the table walker
		 * are deferred to the end of alloc_init_cont_pte().
		 */
		__set_pte_nosync(ptep, pfn_pte(__phys_to_pfn(phys), prot));

		/*
		 * After the PTE entry has been populated once, we
		 * only allow updates to the permission attributes.
		 */
		BUG_ON(!pgattr_change_is_safe(pte_val(old_pte),
					      pte_val(__ptep_get(ptep))));

		phys += PAGE_SIZE;
	} while (ptep++, addr += PAGE_SIZE, addr != end);
}
/*
 * Populate the PTE level below @pmdp for [addr, end), allocating a PTE
 * table if the PMD entry is empty. Sub-ranges aligned to CONT_PTE size
 * are marked PTE_CONT unless NO_CONT_MAPPINGS is set. The PTE table is
 * accessed through a fixmap slot, which is torn down again at the end.
 */
static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
				unsigned long end, phys_addr_t phys,
				pgprot_t prot,
				phys_addr_t (*pgtable_alloc)(int),
				int flags)
{
	unsigned long next;
	pmd_t pmd = READ_ONCE(*pmdp);
	pte_t *ptep;

	/* A block entry here would shadow the PTE table we are about to fill. */
	BUG_ON(pmd_sect(pmd));
	if (pmd_none(pmd)) {
		pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
		phys_addr_t pte_phys;

		if (flags & NO_EXEC_MAPPINGS)
			pmdval |= PMD_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		pte_phys = pgtable_alloc(PAGE_SHIFT);
		ptep = pte_set_fixmap(pte_phys);
		init_clear_pgtable(ptep);
		ptep += pte_index(addr);
		__pmd_populate(pmdp, pte_phys, pmdval);
	} else {
		BUG_ON(pmd_bad(pmd));
		ptep = pte_set_fixmap_offset(pmdp, addr);
	}

	do {
		pgprot_t __prot = prot;

		next = pte_cont_addr_end(addr, end);

		/* use a contiguous mapping if the range is suitably aligned */
		if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
		    (flags & NO_CONT_MAPPINGS) == 0)
			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);

		init_pte(ptep, addr, next, phys, __prot);

		ptep += pte_index(next) - pte_index(addr);
		phys += next - addr;
	} while (addr = next, addr != end);

	/*
	 * Note: barriers and maintenance necessary to clear the fixmap slot
	 * ensure that all previous pgtable writes are visible to the table
	 * walker.
	 */
	pte_clear_fixmap();
}
/*
 * Fill PMD entries for [addr, end): use a section (block) mapping when the
 * range and @phys are PMD-aligned and blocks are allowed, otherwise recurse
 * into alloc_init_cont_pte() to build a PTE table.
 */
static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
		     phys_addr_t phys, pgprot_t prot,
		     phys_addr_t (*pgtable_alloc)(int), int flags)
{
	unsigned long next;

	do {
		pmd_t old_pmd = READ_ONCE(*pmdp);

		next = pmd_addr_end(addr, end);

		/* try section mapping first */
		if (((addr | next | phys) & ~PMD_MASK) == 0 &&
		    (flags & NO_BLOCK_MAPPINGS) == 0) {
			pmd_set_huge(pmdp, phys, prot);

			/*
			 * After the PMD entry has been populated once, we
			 * only allow updates to the permission attributes.
			 */
			BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
						      READ_ONCE(pmd_val(*pmdp))));
		} else {
			alloc_init_cont_pte(pmdp, addr, next, phys, prot,
					    pgtable_alloc, flags);

			/* A table entry, once installed, must stay put. */
			BUG_ON(pmd_val(old_pmd) != 0 &&
			       pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
		}
		phys += next - addr;
	} while (pmdp++, addr = next, addr != end);
}
/*
 * Populate the PMD level below @pudp for [addr, end), allocating a PMD
 * table if the PUD entry is empty. CONT_PMD-aligned sub-ranges get
 * PTE_CONT unless NO_CONT_MAPPINGS is set; the table is mapped via a
 * fixmap slot for the duration of the walk.
 */
static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
				unsigned long end, phys_addr_t phys,
				pgprot_t prot,
				phys_addr_t (*pgtable_alloc)(int), int flags)
{
	unsigned long next;
	pud_t pud = READ_ONCE(*pudp);
	pmd_t *pmdp;

	/*
	 * Check for initial section mappings in the pgd/pud.
	 */
	BUG_ON(pud_sect(pud));
	if (pud_none(pud)) {
		pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
		phys_addr_t pmd_phys;

		if (flags & NO_EXEC_MAPPINGS)
			pudval |= PUD_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		pmd_phys = pgtable_alloc(PMD_SHIFT);
		pmdp = pmd_set_fixmap(pmd_phys);
		init_clear_pgtable(pmdp);
		pmdp += pmd_index(addr);
		__pud_populate(pudp, pmd_phys, pudval);
	} else {
		BUG_ON(pud_bad(pud));
		pmdp = pmd_set_fixmap_offset(pudp, addr);
	}

	do {
		pgprot_t __prot = prot;

		next = pmd_cont_addr_end(addr, end);

		/* use a contiguous mapping if the range is suitably aligned */
		if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
		    (flags & NO_CONT_MAPPINGS) == 0)
			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);

		init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags);

		pmdp += pmd_index(next) - pmd_index(addr);
		phys += next - addr;
	} while (addr = next, addr != end);

	pmd_clear_fixmap();
}
/*
 * Populate the PUD level below @p4dp for [addr, end). A 1GB block entry
 * is used where supported, aligned, and permitted; otherwise the walk
 * descends via alloc_init_cont_pmd(). A new PUD table is allocated if
 * the P4D entry is empty.
 */
static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
			   phys_addr_t phys, pgprot_t prot,
			   phys_addr_t (*pgtable_alloc)(int),
			   int flags)
{
	unsigned long next;
	p4d_t p4d = READ_ONCE(*p4dp);
	pud_t *pudp;

	if (p4d_none(p4d)) {
		p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN;
		phys_addr_t pud_phys;

		if (flags & NO_EXEC_MAPPINGS)
			p4dval |= P4D_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		pud_phys = pgtable_alloc(PUD_SHIFT);
		pudp = pud_set_fixmap(pud_phys);
		init_clear_pgtable(pudp);
		pudp += pud_index(addr);
		__p4d_populate(p4dp, pud_phys, p4dval);
	} else {
		BUG_ON(p4d_bad(p4d));
		pudp = pud_set_fixmap_offset(p4dp, addr);
	}

	do {
		pud_t old_pud = READ_ONCE(*pudp);

		next = pud_addr_end(addr, end);

		/*
		 * For 4K granule only, attempt to put down a 1GB block
		 */
		if (pud_sect_supported() &&
		    ((addr | next | phys) & ~PUD_MASK) == 0 &&
		    (flags & NO_BLOCK_MAPPINGS) == 0) {
			pud_set_huge(pudp, phys, prot);

			/*
			 * After the PUD entry has been populated once, we
			 * only allow updates to the permission attributes.
			 */
			BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
						      READ_ONCE(pud_val(*pudp))));
		} else {
			alloc_init_cont_pmd(pudp, addr, next, phys, prot,
					    pgtable_alloc, flags);

			/* A table entry, once installed, must stay put. */
			BUG_ON(pud_val(old_pud) != 0 &&
			       pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
		}
		phys += next - addr;
	} while (pudp++, addr = next, addr != end);

	pud_clear_fixmap();
}
/*
 * Populate the P4D level below @pgdp for [addr, end), allocating a P4D
 * table when the PGD entry is empty, then descend into alloc_init_pud()
 * for each P4D-sized chunk.
 */
static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
			   phys_addr_t phys, pgprot_t prot,
			   phys_addr_t (*pgtable_alloc)(int),
			   int flags)
{
	unsigned long next;
	pgd_t pgd = READ_ONCE(*pgdp);
	p4d_t *p4dp;

	if (pgd_none(pgd)) {
		pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_UXN;
		phys_addr_t p4d_phys;

		if (flags & NO_EXEC_MAPPINGS)
			pgdval |= PGD_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		p4d_phys = pgtable_alloc(P4D_SHIFT);
		p4dp = p4d_set_fixmap(p4d_phys);
		init_clear_pgtable(p4dp);
		p4dp += p4d_index(addr);
		__pgd_populate(pgdp, p4d_phys, pgdval);
	} else {
		BUG_ON(pgd_bad(pgd));
		p4dp = p4d_set_fixmap_offset(pgdp, addr);
	}

	do {
		p4d_t old_p4d = READ_ONCE(*p4dp);

		next = p4d_addr_end(addr, end);

		alloc_init_pud(p4dp, addr, next, phys, prot,
			       pgtable_alloc, flags);

		/* A table entry, once installed, must stay put. */
		BUG_ON(p4d_val(old_p4d) != 0 &&
		       p4d_val(old_p4d) != READ_ONCE(p4d_val(*p4dp)));

		phys += next - addr;
	} while (p4dp++, addr = next, addr != end);

	p4d_clear_fixmap();
}
/*
 * Walk @pgdir and map @size bytes at @phys to @virt with @prot, building
 * intermediate table levels via @pgtable_alloc as needed. Caller must
 * hold fixmap_lock (or otherwise own the page-table fixmap slots).
 */
static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
					unsigned long virt, phys_addr_t size,
					pgprot_t prot,
					phys_addr_t (*pgtable_alloc)(int),
					int flags)
{
	unsigned long addr, end, next;
	pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);

	/*
	 * If the virtual and physical address don't have the same offset
	 * within a page, we cannot map the region as the caller expects.
	 */
	if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
		return;

	phys &= PAGE_MASK;
	addr = virt & PAGE_MASK;
	end = PAGE_ALIGN(virt + size);

	do {
		next = pgd_addr_end(addr, end);
		alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc,
			       flags);
		phys += next - addr;
	} while (pgdp++, addr = next, addr != end);
}
/*
 * Locked wrapper around __create_pgd_mapping_locked(): the fixmap slots
 * used while constructing the tables are a shared resource, guarded by
 * fixmap_lock.
 */
static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
				 unsigned long virt, phys_addr_t size,
				 pgprot_t prot,
				 phys_addr_t (*pgtable_alloc)(int),
				 int flags)
{
	mutex_lock(&fixmap_lock);
	__create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
				    pgtable_alloc, flags);
	mutex_unlock(&fixmap_lock);
}
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
/*
 * The KPTI G-to-nG rewrite builds a temporary set of page tables through
 * this alias of __create_pgd_mapping_locked(), i.e. without taking
 * fixmap_lock in the wrapper above.
 */
extern __alias(__create_pgd_mapping_locked)
void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
			     phys_addr_t size, pgprot_t prot,
			     phys_addr_t (*pgtable_alloc)(int), int flags);
#endif
  396. static phys_addr_t __pgd_pgtable_alloc(int shift)
  397. {
  398. /* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
  399. void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO);
  400. BUG_ON(!ptr);
  401. return __pa(ptr);
  402. }
/*
 * Like __pgd_pgtable_alloc(), but also runs the page-table constructor
 * appropriate for the level indicated by @shift, so core mm code can
 * later operate on this table.
 */
static phys_addr_t pgd_pgtable_alloc(int shift)
{
	phys_addr_t pa = __pgd_pgtable_alloc(shift);
	struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa));

	/*
	 * Call proper page table ctor in case later we need to
	 * call core mm functions like apply_to_page_range() on
	 * this pre-allocated page table.
	 *
	 * We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is
	 * folded, and if so pagetable_pte_ctor() becomes nop.
	 */
	if (shift == PAGE_SHIFT)
		BUG_ON(!pagetable_pte_ctor(ptdesc));
	else if (shift == PMD_SHIFT)
		BUG_ON(!pagetable_pmd_ctor(ptdesc));

	return pa;
}
/*
 * This function can only be used to modify existing table entries,
 * without allocating new levels of table. Note that this permits the
 * creation of new section or page entries.
 *
 * A NULL allocator is passed down, so a missing intermediate level will
 * trigger the BUG_ON(!pgtable_alloc) checks in the alloc_init_* helpers.
 */
void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
				   phys_addr_t size, pgprot_t prot)
{
	if (virt < PAGE_OFFSET) {
		pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n",
			&phys, virt);
		return;
	}
	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
			     NO_CONT_MAPPINGS);
}
  437. void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
  438. unsigned long virt, phys_addr_t size,
  439. pgprot_t prot, bool page_mappings_only)
  440. {
  441. int flags = 0;
  442. BUG_ON(mm == &init_mm);
  443. if (page_mappings_only)
  444. flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
  445. __create_pgd_mapping(mm->pgd, phys, virt, size, prot,
  446. pgd_pgtable_alloc, flags);
  447. }
/*
 * Change the protections of an existing kernel mapping (no new table
 * levels may be needed: the allocator is NULL) and flush the TLB for
 * the affected range.
 */
static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
				phys_addr_t size, pgprot_t prot)
{
	if (virt < PAGE_OFFSET) {
		pr_warn("BUG: not updating mapping for %pa at 0x%016lx - outside kernel range\n",
			&phys, virt);
		return;
	}

	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
			     NO_CONT_MAPPINGS);

	/* flush the TLBs after updating live kernel mappings */
	flush_tlb_kernel_range(virt, virt + size);
}
/*
 * Map a physical range into the linear map (at __phys_to_virt(start))
 * using the early memblock-based page-table allocator.
 */
static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
				  phys_addr_t end, pgprot_t prot, int flags)
{
	__create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start,
			     prot, early_pgtable_alloc, flags);
}
void __init mark_linear_text_alias_ro(void)
{
	/*
	 * Remove the write permissions from the linear alias of .text/.rodata
	 * (the [_stext, __init_begin) interval mapped writable by map_mem()).
	 */
	update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext),
			    (unsigned long)__init_begin - (unsigned long)_stext,
			    PAGE_KERNEL_RO);
}
#ifdef CONFIG_KFENCE

bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL;

/* early_param() will be parsed before map_mem() below. */
static int __init parse_kfence_early_init(char *arg)
{
	int val;

	if (get_option(&arg, &val))
		kfence_early_init = !!val;
	return 0;
}
early_param("kfence.sample_interval", parse_kfence_early_init);

/*
 * Reserve the KFENCE pool from memblock while it can still be carved out,
 * marking it NOMAP so map_mem()'s linear-map loop skips it. Returns 0 when
 * KFENCE is disabled or the reservation fails.
 */
static phys_addr_t __init arm64_kfence_alloc_pool(void)
{
	phys_addr_t kfence_pool;

	if (!kfence_early_init)
		return 0;

	kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
	if (!kfence_pool) {
		pr_err("failed to allocate kfence pool\n");
		kfence_early_init = false;
		return 0;
	}

	/* Temporarily mark as NOMAP. */
	memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE);

	return kfence_pool;
}

/* Map the reserved pool at page granularity and publish it as __kfence_pool. */
static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp)
{
	if (!kfence_pool)
		return;

	/* KFENCE pool needs page-level mapping. */
	__map_memblock(pgdp, kfence_pool, kfence_pool + KFENCE_POOL_SIZE,
		       pgprot_tagged(PAGE_KERNEL),
		       NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
	memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
	__kfence_pool = phys_to_virt(kfence_pool);
}
#else /* CONFIG_KFENCE */

static inline phys_addr_t arm64_kfence_alloc_pool(void) { return 0; }
static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) { }

#endif /* CONFIG_KFENCE */
/*
 * Create the linear mapping of all memblock memory in @pgdp. The kernel
 * text/rodata interval is mapped separately (read-only alias comes later
 * via mark_linear_text_alias_ro()), and the KFENCE pool, if reserved, is
 * mapped at page granularity.
 */
static void __init map_mem(pgd_t *pgdp)
{
	static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
	phys_addr_t kernel_start = __pa_symbol(_stext);
	phys_addr_t kernel_end = __pa_symbol(__init_begin);
	phys_addr_t start, end;
	phys_addr_t early_kfence_pool;
	int flags = NO_EXEC_MAPPINGS;
	u64 i;

	/*
	 * Setting hierarchical PXNTable attributes on table entries covering
	 * the linear region is only possible if it is guaranteed that no table
	 * entries at any level are being shared between the linear region and
	 * the vmalloc region. Check whether this is true for the PGD level, in
	 * which case it is guaranteed to be true for all other levels as well.
	 * (Unless we are running with support for LPA2, in which case the
	 * entire reduced VA space is covered by a single pgd_t which will have
	 * been populated without the PXNTable attribute by the time we get here.)
	 */
	BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end) &&
		     pgd_index(_PAGE_OFFSET(VA_BITS_MIN)) != PTRS_PER_PGD - 1);

	early_kfence_pool = arm64_kfence_alloc_pool();

	if (can_set_direct_map())
		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

	/*
	 * Take care not to create a writable alias for the
	 * read-only text and rodata sections of the kernel image.
	 * So temporarily mark them as NOMAP to skip mappings in
	 * the following for-loop
	 */
	memblock_mark_nomap(kernel_start, kernel_end - kernel_start);

	/* map all the memory banks */
	for_each_mem_range(i, &start, &end) {
		if (start >= end)
			break;
		/*
		 * The linear map must allow allocation tags reading/writing
		 * if MTE is present. Otherwise, it has the same attributes as
		 * PAGE_KERNEL.
		 */
		__map_memblock(pgdp, start, end, pgprot_tagged(PAGE_KERNEL),
			       flags);
	}

	/*
	 * Map the linear alias of the [_stext, __init_begin) interval
	 * as non-executable now, and remove the write permission in
	 * mark_linear_text_alias_ro() below (which will be called after
	 * alternative patching has completed). This makes the contents
	 * of the region accessible to subsystems such as hibernate,
	 * but protects it from inadvertent modification or execution.
	 * Note that contiguous mappings cannot be remapped in this way,
	 * so we should avoid them here.
	 */
	__map_memblock(pgdp, kernel_start, kernel_end,
		       PAGE_KERNEL, NO_CONT_MAPPINGS);
	memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
	arm64_kfence_map_pool(early_kfence_pool, pgdp);
}
void mark_rodata_ro(void)
{
	unsigned long section_size;

	/*
	 * mark .rodata as read only. Use __init_begin rather than __end_rodata
	 * to cover NOTES and EXCEPTION_TABLE.
	 */
	section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata;
	/*
	 * Clear rodata_is_rw before remapping so that set_swapper_pgd()
	 * switches to the fixmap path for any subsequent PGD updates.
	 */
	WRITE_ONCE(rodata_is_rw, false);
	update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata,
			    section_size, PAGE_KERNEL_RO);
}
  587. static void __init declare_vma(struct vm_struct *vma,
  588. void *va_start, void *va_end,
  589. unsigned long vm_flags)
  590. {
  591. phys_addr_t pa_start = __pa_symbol(va_start);
  592. unsigned long size = va_end - va_start;
  593. BUG_ON(!PAGE_ALIGNED(pa_start));
  594. BUG_ON(!PAGE_ALIGNED(size));
  595. if (!(vm_flags & VM_NO_GUARD))
  596. size += PAGE_SIZE;
  597. vma->addr = va_start;
  598. vma->phys_addr = pa_start;
  599. vma->size = size;
  600. vma->flags = VM_MAP | vm_flags;
  601. vma->caller = __builtin_return_address(0);
  602. vm_area_add_early(vma);
  603. }
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
/* Executable kernel protection, honouring the rodata= setting. */
static pgprot_t kernel_exec_prot(void)
{
	return rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
}

/*
 * Build the EL0 entry trampoline mappings: the trampoline text is placed
 * in tramp_pg_dir at TRAMP_VALIAS and also exposed through fixmap slots
 * in the kernel tables.
 */
static int __init map_entry_trampoline(void)
{
	int i;

	if (!arm64_kernel_unmapped_at_el0())
		return 0;

	pgprot_t prot = kernel_exec_prot();
	phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start);

	/* The trampoline is always mapped and can therefore be global */
	pgprot_val(prot) &= ~PTE_NG;

	/* Map only the text into the trampoline page table */
	memset(tramp_pg_dir, 0, PGD_SIZE);
	__create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS,
			     entry_tramp_text_size(), prot,
			     __pgd_pgtable_alloc, NO_BLOCK_MAPPINGS);

	/* Map both the text and data into the kernel page table */
	for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++)
		__set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
			     pa_start + i * PAGE_SIZE, prot);

	/* The page after the text: read-only (uses @i left over from the loop). */
	if (IS_ENABLED(CONFIG_RELOCATABLE))
		__set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
			     pa_start + i * PAGE_SIZE, PAGE_KERNEL_RO);

	return 0;
}
core_initcall(map_entry_trampoline);
#endif
/*
 * Declare the VMA areas for the kernel
 */
static void __init declare_kernel_vmas(void)
{
	static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];

	/* One vm_struct per image segment; only the last keeps a guard page. */
	declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD);
	declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
	declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
	declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
	declare_vma(&vmlinux_seg[4], _data, _end, 0);
}
/* Early (pi) mapping helper, implemented in the kernel's startup code. */
void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot,
		    int level, pte_t *tbl, bool may_use_cont, u64 va_offset);

/* Statically reserved page-table pages for the ID map and the KPTI flag. */
static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init,
	  kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init;

/*
 * Build the identity map of the .idmap.text section (VA == PA) in
 * idmap_pg_dir, plus — when KPTI may run — a read-write ID mapping of
 * the G-to-nG synchronization flag.
 */
static void __init create_idmap(void)
{
	u64 start = __pa_symbol(__idmap_text_start);
	u64 end   = __pa_symbol(__idmap_text_end);
	u64 ptep  = __pa_symbol(idmap_ptes);

	__pi_map_range(&ptep, start, end, start, PAGE_KERNEL_ROX,
		       IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
		       __phys_to_virt(ptep) - ptep);

	if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings) {
		extern u32 __idmap_kpti_flag;
		u64 pa = __pa_symbol(&__idmap_kpti_flag);

		/*
		 * The KPTI G-to-nG conversion code needs a read-write mapping
		 * of its synchronization flag in the ID map.
		 */
		ptep = __pa_symbol(kpti_ptes);
		__pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL,
			       IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
			       __phys_to_virt(ptep) - ptep);
	}
}
/*
 * paging_init() sets up the page tables: the linear map in
 * swapper_pg_dir, the identity map, and the kernel-image vm_structs.
 */
void __init paging_init(void)
{
	map_mem(swapper_pg_dir);

	/* memblock may now grow its arrays: the linear map exists. */
	memblock_allow_resize();

	create_idmap();
	declare_kernel_vmas();
}
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Free a range of pages that backed a hot-removed mapping: either return
 * it to the vmem_altmap it came from, or hand it back to the page
 * allocator.
 */
static void free_hotplug_page_range(struct page *page, size_t size,
				    struct vmem_altmap *altmap)
{
	if (altmap) {
		vmem_altmap_free(altmap, size >> PAGE_SHIFT);
	} else {
		WARN_ON(PageReserved(page));
		free_pages((unsigned long)page_address(page), get_order(size));
	}
}
/* Free a single page-table page (never altmap-backed). */
static void free_hotplug_pgtable_page(struct page *page)
{
	free_hotplug_page_range(page, PAGE_SIZE, NULL);
}
  693. static bool pgtable_range_aligned(unsigned long start, unsigned long end,
  694. unsigned long floor, unsigned long ceiling,
  695. unsigned long mask)
  696. {
  697. start &= mask;
  698. if (start < floor)
  699. return false;
  700. if (ceiling) {
  701. ceiling &= mask;
  702. if (!ceiling)
  703. return false;
  704. }
  705. if (end - 1 > ceiling - 1)
  706. return false;
  707. return true;
  708. }
/*
 * Clear every leaf PTE in [addr, end), flushing the TLB per page, and
 * optionally free the memory that was mapped (when @free_mapped).
 */
static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
				    unsigned long end, bool free_mapped,
				    struct vmem_altmap *altmap)
{
	pte_t *ptep, pte;

	do {
		ptep = pte_offset_kernel(pmdp, addr);
		pte = __ptep_get(ptep);
		if (pte_none(pte))
			continue;

		WARN_ON(!pte_present(pte));
		__pte_clear(&init_mm, addr, ptep);
		/* Flush before the backing page can be reused elsewhere. */
		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
		if (free_mapped)
			free_hotplug_page_range(pte_page(pte),
						PAGE_SIZE, altmap);
	} while (addr += PAGE_SIZE, addr < end);
}
/*
 * Walk the PMD entries covering [addr, end): tear down section (block)
 * mappings directly, recurse into PTE tables otherwise.
 */
static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
				    unsigned long end, bool free_mapped,
				    struct vmem_altmap *altmap)
{
	unsigned long next;
	pmd_t *pmdp, pmd;

	do {
		next = pmd_addr_end(addr, end);
		pmdp = pmd_offset(pudp, addr);
		pmd = READ_ONCE(*pmdp);
		if (pmd_none(pmd))
			continue;

		WARN_ON(!pmd_present(pmd));
		if (pmd_sect(pmd)) {
			pmd_clear(pmdp);

			/*
			 * One TLBI should be sufficient here as the PMD_SIZE
			 * range is mapped with a single block entry.
			 */
			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
			if (free_mapped)
				free_hotplug_page_range(pmd_page(pmd),
							PMD_SIZE, altmap);
			continue;
		}
		WARN_ON(!pmd_table(pmd));
		unmap_hotplug_pte_range(pmdp, addr, next, free_mapped, altmap);
	} while (addr = next, addr < end);
}
/*
 * Walk the PUD entries covering [addr, end): tear down section (block)
 * mappings directly, recurse into PMD tables otherwise.
 */
static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
				    unsigned long end, bool free_mapped,
				    struct vmem_altmap *altmap)
{
	unsigned long next;
	pud_t *pudp, pud;

	do {
		next = pud_addr_end(addr, end);
		pudp = pud_offset(p4dp, addr);
		pud = READ_ONCE(*pudp);
		if (pud_none(pud))
			continue;

		WARN_ON(!pud_present(pud));
		if (pud_sect(pud)) {
			pud_clear(pudp);

			/*
			 * One TLBI should be sufficient here as the PUD_SIZE
			 * range is mapped with a single block entry.
			 */
			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
			if (free_mapped)
				free_hotplug_page_range(pud_page(pud),
							PUD_SIZE, altmap);
			continue;
		}
		WARN_ON(!pud_table(pud));
		unmap_hotplug_pmd_range(pudp, addr, next, free_mapped, altmap);
	} while (addr = next, addr < end);
}
/*
 * Walk the P4D entries covering [addr, end) and recurse into the PUD
 * level; no block mappings exist at this level.
 */
static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
				    unsigned long end, bool free_mapped,
				    struct vmem_altmap *altmap)
{
	unsigned long next;
	p4d_t *p4dp, p4d;

	do {
		next = p4d_addr_end(addr, end);
		p4dp = p4d_offset(pgdp, addr);
		p4d = READ_ONCE(*p4dp);
		if (p4d_none(p4d))
			continue;

		WARN_ON(!p4d_present(p4d));
		unmap_hotplug_pud_range(p4dp, addr, next, free_mapped, altmap);
	} while (addr = next, addr < end);
}
/*
 * Unmap the kernel virtual range [addr, end) from the init_mm page
 * tables, optionally freeing the mapped memory (@free_mapped). Page-table
 * pages themselves are freed separately by free_empty_tables().
 */
static void unmap_hotplug_range(unsigned long addr, unsigned long end,
				bool free_mapped, struct vmem_altmap *altmap)
{
	unsigned long next;
	pgd_t *pgdp, pgd;

	/*
	 * altmap can only be used as vmemmap mapping backing memory.
	 * In case the backing memory itself is not being freed, then
	 * altmap is irrelevant. Warn about this inconsistency when
	 * encountered.
	 */
	WARN_ON(!free_mapped && altmap);

	do {
		next = pgd_addr_end(addr, end);
		pgdp = pgd_offset_k(addr);
		pgd = READ_ONCE(*pgdp);
		if (pgd_none(pgd))
			continue;

		WARN_ON(!pgd_present(pgd));
		unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap);
	} while (addr = next, addr < end);
}
/*
 * Sanity-check that all PTEs under [addr, end) were already cleared by
 * the unmap pass, then free the PTE page itself when [start, end) rounds
 * out to a whole PMD entry (within floor/ceiling) and every PTE in the
 * page is empty.
 */
static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
				 unsigned long end, unsigned long floor,
				 unsigned long ceiling)
{
	pte_t *ptep, pte;
	unsigned long i, start = addr;

	do {
		ptep = pte_offset_kernel(pmdp, addr);
		pte = __ptep_get(ptep);

		/*
		 * This is just a sanity check here which verifies that
		 * pte clearing has been done by earlier unmap loops.
		 */
		WARN_ON(!pte_none(pte));
	} while (addr += PAGE_SIZE, addr < end);

	if (!pgtable_range_aligned(start, end, floor, ceiling, PMD_MASK))
		return;

	/*
	 * Check whether we can free the pte page if the rest of the
	 * entries are empty. Overlap with other regions have been
	 * handled by the floor/ceiling check.
	 */
	ptep = pte_offset_kernel(pmdp, 0UL);
	for (i = 0; i < PTRS_PER_PTE; i++) {
		if (!pte_none(__ptep_get(&ptep[i])))
			return;
	}

	pmd_clear(pmdp);
	/* Flush the stale table-walk cache entry before freeing the page. */
	__flush_tlb_kernel_pgtable(start);
	free_hotplug_pgtable_page(virt_to_page(ptep));
}
/*
 * Recurse into each populated PMD entry to free empty PTE pages, then
 * free the PMD page itself when [start, end) rounds out to a whole PUD
 * entry (within floor/ceiling) and every PMD in the page is empty.
 */
static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
				 unsigned long end, unsigned long floor,
				 unsigned long ceiling)
{
	pmd_t *pmdp, pmd;
	unsigned long i, next, start = addr;

	do {
		next = pmd_addr_end(addr, end);
		pmdp = pmd_offset(pudp, addr);
		pmd = READ_ONCE(*pmdp);
		if (pmd_none(pmd))
			continue;

		/* Block mappings must already be gone by this point. */
		WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
		free_empty_pte_table(pmdp, addr, next, floor, ceiling);
	} while (addr = next, addr < end);

	/* With 2-level tables the PMD is folded into the PGD; nothing to free. */
	if (CONFIG_PGTABLE_LEVELS <= 2)
		return;

	if (!pgtable_range_aligned(start, end, floor, ceiling, PUD_MASK))
		return;

	/*
	 * Check whether we can free the pmd page if the rest of the
	 * entries are empty. Overlap with other regions have been
	 * handled by the floor/ceiling check.
	 */
	pmdp = pmd_offset(pudp, 0UL);
	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(READ_ONCE(pmdp[i])))
			return;
	}

	pud_clear(pudp);
	/* Flush the stale table-walk cache entry before freeing the page. */
	__flush_tlb_kernel_pgtable(start);
	free_hotplug_pgtable_page(virt_to_page(pmdp));
}
/*
 * Recurse into each populated PUD entry to free empty PMD pages, then
 * free the PUD page itself when [start, end) rounds out to a whole P4D
 * entry (within floor/ceiling) and every PUD in the page is empty.
 */
static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
				 unsigned long end, unsigned long floor,
				 unsigned long ceiling)
{
	pud_t *pudp, pud;
	unsigned long i, next, start = addr;

	do {
		next = pud_addr_end(addr, end);
		pudp = pud_offset(p4dp, addr);
		pud = READ_ONCE(*pudp);
		if (pud_none(pud))
			continue;

		/* Block mappings must already be gone by this point. */
		WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
		free_empty_pmd_table(pudp, addr, next, floor, ceiling);
	} while (addr = next, addr < end);

	/* Without a real 4th level the PUD is folded; nothing to free. */
	if (!pgtable_l4_enabled())
		return;

	if (!pgtable_range_aligned(start, end, floor, ceiling, P4D_MASK))
		return;

	/*
	 * Check whether we can free the pud page if the rest of the
	 * entries are empty. Overlap with other regions have been
	 * handled by the floor/ceiling check.
	 */
	pudp = pud_offset(p4dp, 0UL);
	for (i = 0; i < PTRS_PER_PUD; i++) {
		if (!pud_none(READ_ONCE(pudp[i])))
			return;
	}

	p4d_clear(p4dp);
	/* Flush the stale table-walk cache entry before freeing the page. */
	__flush_tlb_kernel_pgtable(start);
	free_hotplug_pgtable_page(virt_to_page(pudp));
}
/*
 * Recurse into each populated P4D entry to free empty PUD pages, then
 * free the P4D page itself when [start, end) rounds out to a whole PGD
 * entry (within floor/ceiling) and every P4D in the page is empty.
 */
static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
				 unsigned long end, unsigned long floor,
				 unsigned long ceiling)
{
	p4d_t *p4dp, p4d;
	unsigned long i, next, start = addr;

	do {
		next = p4d_addr_end(addr, end);
		p4dp = p4d_offset(pgdp, addr);
		p4d = READ_ONCE(*p4dp);
		if (p4d_none(p4d))
			continue;

		WARN_ON(!p4d_present(p4d));
		free_empty_pud_table(p4dp, addr, next, floor, ceiling);
	} while (addr = next, addr < end);

	/* Without a real 5th level the P4D is folded; nothing to free. */
	if (!pgtable_l5_enabled())
		return;

	if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK))
		return;

	/*
	 * Check whether we can free the p4d page if the rest of the
	 * entries are empty. Overlap with other regions have been
	 * handled by the floor/ceiling check.
	 */
	p4dp = p4d_offset(pgdp, 0UL);
	for (i = 0; i < PTRS_PER_P4D; i++) {
		if (!p4d_none(READ_ONCE(p4dp[i])))
			return;
	}

	pgd_clear(pgdp);
	/* Flush the stale table-walk cache entry before freeing the page. */
	__flush_tlb_kernel_pgtable(start);
	free_hotplug_pgtable_page(virt_to_page(p4dp));
}
/*
 * After unmap_hotplug_range() has cleared the leaf entries, walk
 * [addr, end) again and free any page-table pages that are now entirely
 * empty, constrained by the floor/ceiling of the enclosing region.
 */
static void free_empty_tables(unsigned long addr, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	unsigned long next;
	pgd_t *pgdp, pgd;

	do {
		next = pgd_addr_end(addr, end);
		pgdp = pgd_offset_k(addr);
		pgd = READ_ONCE(*pgdp);
		if (pgd_none(pgd))
			continue;

		WARN_ON(!pgd_present(pgd));
		free_empty_p4d_table(pgdp, addr, next, floor, ceiling);
	} while (addr = next, addr < end);
}
  968. #endif
/* Install a PMD-sized block mapping at @p for the vmemmap populate path. */
void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
			       unsigned long addr, unsigned long next)
{
	pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
}
/*
 * Tell the generic vmemmap code an existing PMD entry can be reused;
 * returns 1 (accepted) after verifying it against @node.
 */
int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
				unsigned long addr, unsigned long next)
{
	vmemmap_verify((pte_t *)pmdp, node, addr, next);
	return 1;
}
  980. int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
  981. struct vmem_altmap *altmap)
  982. {
  983. WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
  984. if (!IS_ENABLED(CONFIG_ARM64_4K_PAGES))
  985. return vmemmap_populate_basepages(start, end, node, altmap);
  986. else
  987. return vmemmap_populate_hugepages(start, end, node, altmap);
  988. }
  989. #ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Tear down the vmemmap for [start, end): unmap and free the backing
 * memory, then free any page-table pages left empty.
 */
void vmemmap_free(unsigned long start, unsigned long end,
		  struct vmem_altmap *altmap)
{
	WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));

	unmap_hotplug_range(start, end, true, altmap);
	free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
}
  997. #endif /* CONFIG_MEMORY_HOTPLUG */
/*
 * Install a PUD-sized block mapping for @phys. Returns 1 on success, 0
 * if the attribute change from the current entry would be unsafe
 * (break-before-make would be required).
 */
int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
{
	pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot));

	/* Only allow permission changes for now */
	if (!pgattr_change_is_safe(READ_ONCE(pud_val(*pudp)),
				   pud_val(new_pud)))
		return 0;

	VM_BUG_ON(phys & ~PUD_MASK);
	set_pud(pudp, new_pud);
	return 1;
}
/*
 * Install a PMD-sized block mapping for @phys. Returns 1 on success, 0
 * if the attribute change from the current entry would be unsafe
 * (break-before-make would be required).
 */
int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
{
	pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot));

	/* Only allow permission changes for now */
	if (!pgattr_change_is_safe(READ_ONCE(pmd_val(*pmdp)),
				   pmd_val(new_pmd)))
		return 0;

	VM_BUG_ON(phys & ~PMD_MASK);
	set_pmd(pmdp, new_pmd);
	return 1;
}
  1020. #ifndef __PAGETABLE_P4D_FOLDED
/* No P4D-level block mappings are ever created here, so nothing to clear. */
void p4d_clear_huge(p4d_t *p4dp)
{
}
  1024. #endif
  1025. int pud_clear_huge(pud_t *pudp)
  1026. {
  1027. if (!pud_sect(READ_ONCE(*pudp)))
  1028. return 0;
  1029. pud_clear(pudp);
  1030. return 1;
  1031. }
  1032. int pmd_clear_huge(pmd_t *pmdp)
  1033. {
  1034. if (!pmd_sect(READ_ONCE(*pmdp)))
  1035. return 0;
  1036. pmd_clear(pmdp);
  1037. return 1;
  1038. }
/*
 * Detach and free the PTE table hanging off @pmdp (used by the vmalloc
 * huge-mapping code). Clear the entry and flush the table-walk cache
 * before freeing. Always returns 1; warns if the entry is not a table.
 */
int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
{
	pte_t *table;
	pmd_t pmd;

	pmd = READ_ONCE(*pmdp);
	if (!pmd_table(pmd)) {
		VM_WARN_ON(1);
		return 1;
	}

	table = pte_offset_kernel(pmdp, addr);
	pmd_clear(pmdp);
	__flush_tlb_kernel_pgtable(addr);
	pte_free_kernel(NULL, table);
	return 1;
}
/*
 * Detach and free the PMD table hanging off @pudp, first freeing any PTE
 * tables below each of its PMD_SIZE slots. Always returns 1; warns if
 * the entry is not a table.
 */
int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
{
	pmd_t *table;
	pmd_t *pmdp;
	pud_t pud;
	unsigned long next, end;

	pud = READ_ONCE(*pudp);
	if (!pud_table(pud)) {
		VM_WARN_ON(1);
		return 1;
	}

	table = pmd_offset(pudp, addr);
	pmdp = table;
	next = addr;
	end = addr + PUD_SIZE;
	do {
		/* Release the PTE table (if any) under each PMD slot. */
		pmd_free_pte_page(pmdp, next);
	} while (pmdp++, next += PMD_SIZE, next != end);

	pud_clear(pudp);
	__flush_tlb_kernel_pgtable(addr);
	pmd_free(NULL, table);
	return 1;
}
  1077. #ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Remove the linear mapping for [start, start + size) from init_mm
 * without freeing the mapped memory, then free any now-empty page-table
 * pages. Only valid for the kernel's own page tables.
 */
static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
{
	unsigned long end = start + size;

	WARN_ON(pgdir != init_mm.pgd);
	WARN_ON((start < PAGE_OFFSET) || (end > PAGE_END));

	unmap_hotplug_range(start, end, false, NULL);
	free_empty_tables(start, end, PAGE_OFFSET, PAGE_END);
}
/*
 * Report the physical address range eligible for memory hot-add, i.e.
 * the range that the kernel's linear mapping can cover.
 */
struct range arch_get_mappable_range(void)
{
	struct range mhp_range;
	u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual));
	u64 end_linear_pa = __pa(PAGE_END - 1);

	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
		/*
		 * Check for a wrap, it is possible because of randomized linear
		 * mapping the start physical address is actually bigger than
		 * the end physical address. In this case set start to zero
		 * because [0, end_linear_pa] range must still be able to cover
		 * all addressable physical addresses.
		 */
		if (start_linear_pa > end_linear_pa)
			start_linear_pa = 0;
	}

	WARN_ON(start_linear_pa > end_linear_pa);

	/*
	 * Linear mapping region is the range [PAGE_OFFSET..(PAGE_END - 1)]
	 * accommodating both its ends but excluding PAGE_END. Max physical
	 * range which can be mapped inside this linear mapping range, must
	 * also be derived from its end points.
	 */
	mhp_range.start = start_linear_pa;
	mhp_range.end = end_linear_pa;
	return mhp_range;
}
/*
 * Hot-add @size bytes of memory at physical @start on node @nid: create
 * the linear mapping for the range, then register the pages with the
 * core mm. On failure the freshly-created mapping is torn down again.
 */
int arch_add_memory(int nid, u64 start, u64 size,
		    struct mhp_params *params)
{
	int ret, flags = NO_EXEC_MAPPINGS;

	VM_BUG_ON(!mhp_range_allowed(start, size, true));

	/* Force page granularity when the direct map must stay modifiable. */
	if (can_set_direct_map())
		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
			     size, params->pgprot, __pgd_pgtable_alloc,
			     flags);

	memblock_clear_nomap(start, size);

	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
			  params);
	if (ret)
		__remove_pgd_mapping(swapper_pg_dir,
				     __phys_to_virt(start), size);
	else {
		/* Keep the pfn high-water marks in sync with the new range. */
		max_pfn = PFN_UP(start + size);
		max_low_pfn = max_pfn;
	}

	return ret;
}
/*
 * Hot-remove @size bytes at physical @start: release the pages from the
 * core mm, then tear down the linear mapping for the range.
 */
void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;

	__remove_pages(start_pfn, nr_pages, altmap);
	__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
}
  1142. /*
  1143. * This memory hotplug notifier helps prevent boot memory from being
  1144. * inadvertently removed as it blocks pfn range offlining process in
  1145. * __offline_pages(). Hence this prevents both offlining as well as
  1146. * removal process for boot memory which is initially always online.
  1147. * In future if and when boot memory could be removed, this notifier
  1148. * should be dropped and free_hotplug_page_range() should handle any
  1149. * reserved pages allocated during boot.
  1150. */
  1151. static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
  1152. unsigned long action, void *data)
  1153. {
  1154. struct mem_section *ms;
  1155. struct memory_notify *arg = data;
  1156. unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
  1157. unsigned long pfn = arg->start_pfn;
  1158. if ((action != MEM_GOING_OFFLINE) && (action != MEM_OFFLINE))
  1159. return NOTIFY_OK;
  1160. for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
  1161. unsigned long start = PFN_PHYS(pfn);
  1162. unsigned long end = start + (1UL << PA_SECTION_SHIFT);
  1163. ms = __pfn_to_section(pfn);
  1164. if (!early_section(ms))
  1165. continue;
  1166. if (action == MEM_GOING_OFFLINE) {
  1167. /*
  1168. * Boot memory removal is not supported. Prevent
  1169. * it via blocking any attempted offline request
  1170. * for the boot memory and just report it.
  1171. */
  1172. pr_warn("Boot memory [%lx %lx] offlining attempted\n", start, end);
  1173. return NOTIFY_BAD;
  1174. } else if (action == MEM_OFFLINE) {
  1175. /*
  1176. * This should have never happened. Boot memory
  1177. * offlining should have been prevented by this
  1178. * very notifier. Probably some memory removal
  1179. * procedure might have changed which would then
  1180. * require further debug.
  1181. */
  1182. pr_err("Boot memory [%lx %lx] offlined\n", start, end);
  1183. /*
  1184. * Core memory hotplug does not process a return
  1185. * code from the notifier for MEM_OFFLINE events.
  1186. * The error condition has been reported. Return
  1187. * from here as if ignored.
  1188. */
  1189. return NOTIFY_DONE;
  1190. }
  1191. }
  1192. return NOTIFY_OK;
  1193. }
/* Notifier registered in prevent_bootmem_remove_init() below. */
static struct notifier_block prevent_bootmem_remove_nb = {
	.notifier_call = prevent_bootmem_remove_notifier,
};
  1197. /*
  1198. * This ensures that boot memory sections on the platform are online
  1199. * from early boot. Memory sections could not be prevented from being
  1200. * offlined, unless for some reason they are not online to begin with.
  1201. * This helps validate the basic assumption on which the above memory
  1202. * event notifier works to prevent boot memory section offlining and
  1203. * its possible removal.
  1204. */
  1205. static void validate_bootmem_online(void)
  1206. {
  1207. phys_addr_t start, end, addr;
  1208. struct mem_section *ms;
  1209. u64 i;
  1210. /*
  1211. * Scanning across all memblock might be expensive
  1212. * on some big memory systems. Hence enable this
  1213. * validation only with DEBUG_VM.
  1214. */
  1215. if (!IS_ENABLED(CONFIG_DEBUG_VM))
  1216. return;
  1217. for_each_mem_range(i, &start, &end) {
  1218. for (addr = start; addr < end; addr += (1UL << PA_SECTION_SHIFT)) {
  1219. ms = __pfn_to_section(PHYS_PFN(addr));
  1220. /*
  1221. * All memory ranges in the system at this point
  1222. * should have been marked as early sections.
  1223. */
  1224. WARN_ON(!early_section(ms));
  1225. /*
  1226. * Memory notifier mechanism here to prevent boot
  1227. * memory offlining depends on the fact that each
  1228. * early section memory on the system is initially
  1229. * online. Otherwise a given memory section which
  1230. * is already offline will be overlooked and can
  1231. * be removed completely. Call out such sections.
  1232. */
  1233. if (!online_section(ms))
  1234. pr_err("Boot memory [%llx %llx] is offline, can be removed\n",
  1235. addr, addr + (1UL << PA_SECTION_SHIFT));
  1236. }
  1237. }
  1238. }
/*
 * Early initcall: validate that boot memory is online, then register the
 * notifier that blocks boot-memory offlining. Only needed when memory
 * hot-remove is configured.
 */
static int __init prevent_bootmem_remove_init(void)
{
	int ret = 0;

	if (!IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
		return ret;

	validate_bootmem_online();
	ret = register_memory_notifier(&prevent_bootmem_remove_nb);
	if (ret)
		pr_err("%s: Notifier registration failed %d\n", __func__, ret);

	return ret;
}
early_initcall(prevent_bootmem_remove_init);
  1251. #endif
/*
 * Begin a prot-change transaction on @ptep: clear the entry and return
 * its old value. On CPUs with erratum #2645198 a full clear+flush is
 * used for user-executable PTEs to enforce break-before-make.
 */
pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
{
	if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) {
		/*
		 * Break-before-make (BBM) is required for all user space mappings
		 * when the permission changes from executable to non-executable
		 * in cases where cpu is affected with errata #2645198.
		 */
		if (pte_user_exec(ptep_get(ptep)))
			return ptep_clear_flush(vma, addr, ptep);
	}
	return ptep_get_and_clear(vma->vm_mm, addr, ptep);
}
/*
 * Complete a prot-change transaction started by ptep_modify_prot_start()
 * by installing the new PTE value.
 */
void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
			     pte_t old_pte, pte_t pte)
{
	set_pte_at(vma->vm_mm, addr, ptep, pte);
}
  1270. /*
  1271. * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD,
  1272. * avoiding the possibility of conflicting TLB entries being allocated.
  1273. */
  1274. void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp)
  1275. {
  1276. typedef void (ttbr_replace_func)(phys_addr_t);
  1277. extern ttbr_replace_func idmap_cpu_replace_ttbr1;
  1278. ttbr_replace_func *replace_phys;
  1279. unsigned long daif;
  1280. /* phys_to_ttbr() zeros lower 2 bits of ttbr with 52-bit PA */
  1281. phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp));
  1282. if (cnp)
  1283. ttbr1 |= TTBR_CNP_BIT;
  1284. replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1);
  1285. cpu_install_idmap();
  1286. /*
  1287. * We really don't want to take *any* exceptions while TTBR1 is
  1288. * in the process of being replaced so mask everything.
  1289. */
  1290. daif = local_daif_save();
  1291. replace_phys(ttbr1);
  1292. local_daif_restore(daif);
  1293. cpu_uninstall_idmap();
  1294. }
  1295. #ifdef CONFIG_ARCH_HAS_PKEYS
/*
 * Translate the generic PKEY_DISABLE_* bits in @init_val into POE
 * permission bits and program them into @pkey's slot of POR_EL0.
 * Returns 0 on success, -ENOSPC when POE is unsupported, -EINVAL for an
 * out-of-range pkey.
 */
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val)
{
	u64 new_por = POE_RXW;
	u64 old_por;
	u64 pkey_shift;

	if (!system_supports_poe())
		return -ENOSPC;

	/*
	 * This code should only be called with valid 'pkey'
	 * values originating from in-kernel users. Complain
	 * if a bad value is observed.
	 */
	if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
		return -EINVAL;

	/* Set the bits we need in POR:  */
	new_por = POE_RXW;
	if (init_val & PKEY_DISABLE_WRITE)
		new_por &= ~POE_W;
	if (init_val & PKEY_DISABLE_ACCESS)
		new_por &= ~POE_RW;
	if (init_val & PKEY_DISABLE_READ)
		new_por &= ~POE_R;
	if (init_val & PKEY_DISABLE_EXECUTE)
		new_por &= ~POE_X;

	/* Shift the bits in to the correct place in POR for pkey: */
	pkey_shift = pkey * POR_BITS_PER_PKEY;
	new_por <<= pkey_shift;

	/* Get old POR and mask off any old bits in place: */
	old_por = read_sysreg_s(SYS_POR_EL0);
	old_por &= ~(POE_MASK << pkey_shift);

	/* Write old part along with new part: */
	write_sysreg_s(old_por | new_por, SYS_POR_EL0);

	return 0;
}
  1330. #endif