radix_pgtable.c

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Page table handling routines for radix page table.
 *
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 */

#define pr_fmt(fmt) "radix-mmu: " fmt

#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/sched/mm.h>
#include <linux/memblock.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/string_helpers.h>
#include <linux/memory.h>
#include <linux/kfence.h>

#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/dma.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/firmware.h>
#include <asm/powernv.h>
#include <asm/sections.h>
#include <asm/smp.h>
#include <asm/trace.h>
#include <asm/uaccess.h>
#include <asm/ultravisor.h>
#include <asm/set_memory.h>
#include <asm/kfence.h>

#include <trace/events/thp.h>

#include <mm/mmu_decl.h>

unsigned int mmu_base_pid;

static __ref void *early_alloc_pgtable(unsigned long size, int nid,
			unsigned long region_start, unsigned long region_end)
{
	phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
	phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
	void *ptr;

	if (region_start)
		min_addr = region_start;
	if (region_end)
		max_addr = region_end;

	ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);

	if (!ptr)
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
		      __func__, size, size, nid, &min_addr, &max_addr);

	return ptr;
}

/*
 * When allocating pud or pmd pointers, we allocate a complete page
 * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This
 * is to ensure that the page obtained from the memblock allocator
 * can be completely used as page table page and can be freed
 * correctly when the page table entries are removed.
 */
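
/*
 * Illustrative aside (not in the original source): a PUD or PMD table needs
 * far less than a full page here. Assuming RADIX_PUD_INDEX_SIZE == 9 and a
 * 64K PAGE_SIZE:
 *
 *	PUD_TABLE_SIZE = sizeof(pud_t) << RADIX_PUD_INDEX_SIZE
 *		       = 8 << 9 = 4K
 *
 * so rounding the allocation up to PAGE_SIZE wastes some memory, but lets
 * the whole page be handed back to the allocator when the table is torn down.
 */
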
static int early_map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	pgdp = pgd_offset_k(ea);
	p4dp = p4d_offset(pgdp, ea);
	if (p4d_none(*p4dp)) {
		pudp = early_alloc_pgtable(PAGE_SIZE, nid,
					   region_start, region_end);
		p4d_populate(&init_mm, p4dp, pudp);
	}
	pudp = pud_offset(p4dp, ea);
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	if (pud_none(*pudp)) {
		pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
					   region_end);
		pud_populate(&init_mm, pudp, pmdp);
	}
	pmdp = pmd_offset(pudp, ea);
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	if (!pmd_present(*pmdp)) {
		ptep = early_alloc_pgtable(PAGE_SIZE, nid,
					   region_start, region_end);
		pmd_populate_kernel(&init_mm, pmdp, ptep);
	}
	ptep = pte_offset_kernel(pmdp, ea);

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	asm volatile("ptesync": : :"memory");
	return 0;
}

/*
 * nid, region_start, and region_end are hints to try to place the page
 * table memory in the same node or region.
 */
static int __map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	/*
	 * Make sure task size is correct as per the max addr
	 */
	BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);

#ifdef CONFIG_PPC_64K_PAGES
	BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
#endif

	if (unlikely(!slab_is_available()))
		return early_map_kernel_page(ea, pa, flags, map_page_size,
					     nid, region_start, region_end);

	/*
	 * We should make the page table allocation functions able to take a
	 * node, so we can place kernel page tables on the right nodes after
	 * boot.
	 */
	pgdp = pgd_offset_k(ea);
	p4dp = p4d_offset(pgdp, ea);
	pudp = pud_alloc(&init_mm, p4dp, ea);
	if (!pudp)
		return -ENOMEM;
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	pmdp = pmd_alloc(&init_mm, pudp, ea);
	if (!pmdp)
		return -ENOMEM;
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	ptep = pte_alloc_kernel(pmdp, ea);
	if (!ptep)
		return -ENOMEM;

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	asm volatile("ptesync": : :"memory");
	return 0;
}

int radix__map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
}
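
/*
 * Illustrative usage sketch (not in the original source): mapping a single
 * 2M kernel page, assuming `ea' and `pa' are both PMD_SIZE aligned:
 *
 *	int rc = radix__map_kernel_page(ea, pa, PAGE_KERNEL, PMD_SIZE);
 *	if (rc)
 *		return rc;	// -ENOMEM if a table level couldn't be allocated
 *
 * map_page_size selects the level at which the leaf PTE is installed:
 * PUD_SIZE (1G), PMD_SIZE (2M) or PAGE_SIZE.
 */
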
#ifdef CONFIG_STRICT_KERNEL_RWX
static void radix__change_memory_range(unsigned long start, unsigned long end,
				       unsigned long clear)
{
	unsigned long idx;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	start = ALIGN_DOWN(start, PAGE_SIZE);
	end = PAGE_ALIGN(end); // aligns up

	pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
		 start, end, clear);

	for (idx = start; idx < end; idx += PAGE_SIZE) {
		pgdp = pgd_offset_k(idx);
		p4dp = p4d_offset(pgdp, idx);
		pudp = pud_alloc(&init_mm, p4dp, idx);
		if (!pudp)
			continue;
		if (pud_leaf(*pudp)) {
			ptep = (pte_t *)pudp;
			goto update_the_pte;
		}
		pmdp = pmd_alloc(&init_mm, pudp, idx);
		if (!pmdp)
			continue;
		if (pmd_leaf(*pmdp)) {
			ptep = pmdp_ptep(pmdp);
			goto update_the_pte;
		}
		ptep = pte_alloc_kernel(pmdp, idx);
		if (!ptep)
			continue;
update_the_pte:
		radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
	}

	radix__flush_tlb_kernel_range(start, end);
}

void radix__mark_rodata_ro(void)
{
	unsigned long start, end;

	start = (unsigned long)_stext;
	end = (unsigned long)__end_rodata;

	radix__change_memory_range(start, end, _PAGE_WRITE);

	for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) {
		end = start + PAGE_SIZE;
		if (overlaps_interrupt_vector_text(start, end))
			radix__change_memory_range(start, end, _PAGE_WRITE);
		else
			break;
	}
}

void radix__mark_initmem_nx(void)
{
	unsigned long start = (unsigned long)__init_begin;
	unsigned long end = (unsigned long)__init_end;

	radix__change_memory_range(start, end, _PAGE_EXEC);
}
#endif /* CONFIG_STRICT_KERNEL_RWX */
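
/*
 * Illustrative aside (not in the original source): the `clear' argument is a
 * mask of PTE bits to drop, so marking rodata read-only boils down to
 * clearing _PAGE_WRITE from every PTE in [_stext, __end_rodata), roughly:
 *
 *	radix__pte_update(&init_mm, addr, ptep, _PAGE_WRITE, 0, 0);
 *
 * followed by a ranged TLB flush so the stale writable translations go away.
 * radix__mark_initmem_nx() is the same operation with _PAGE_EXEC.
 */
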
static inline void __meminit
print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
{
	char buf[10];

	if (end <= start)
		return;

	string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));

	pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
		exec ? " (exec)" : "");
}

static unsigned long next_boundary(unsigned long addr, unsigned long end)
{
#ifdef CONFIG_STRICT_KERNEL_RWX
	unsigned long stext_phys;

	stext_phys = __pa_symbol(_stext);

	// Relocatable kernel running at non-zero real address
	if (stext_phys != 0) {
		// The end of interrupts code at zero is a rodata boundary
		unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys;
		if (addr < end_intr)
			return end_intr;

		// Start of relocated kernel text is a rodata boundary
		if (addr < stext_phys)
			return stext_phys;
	}

	if (addr < __pa_symbol(__srwx_boundary))
		return __pa_symbol(__srwx_boundary);
#endif
	return end;
}

static int __meminit create_physical_mapping(unsigned long start,
					     unsigned long end,
					     int nid, pgprot_t _prot,
					     unsigned long mapping_sz_limit)
{
	unsigned long vaddr, addr, mapping_size = 0;
	bool prev_exec, exec = false;
	pgprot_t prot;
	int psize;
	unsigned long max_mapping_size = memory_block_size;

	if (mapping_sz_limit < max_mapping_size)
		max_mapping_size = mapping_sz_limit;

	if (debug_pagealloc_enabled())
		max_mapping_size = PAGE_SIZE;

	start = ALIGN(start, PAGE_SIZE);
	end = ALIGN_DOWN(end, PAGE_SIZE);
	for (addr = start; addr < end; addr += mapping_size) {
		unsigned long gap, previous_size;
		int rc;

		gap = next_boundary(addr, end) - addr;
		if (gap > max_mapping_size)
			gap = max_mapping_size;
		previous_size = mapping_size;
		prev_exec = exec;

		if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
		    mmu_psize_defs[MMU_PAGE_1G].shift) {
			mapping_size = PUD_SIZE;
			psize = MMU_PAGE_1G;
		} else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
			   mmu_psize_defs[MMU_PAGE_2M].shift) {
			mapping_size = PMD_SIZE;
			psize = MMU_PAGE_2M;
		} else {
			mapping_size = PAGE_SIZE;
			psize = mmu_virtual_psize;
		}

		vaddr = (unsigned long)__va(addr);

		if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
		    overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
			prot = PAGE_KERNEL_X;
			exec = true;
		} else {
			prot = _prot;
			exec = false;
		}

		if (mapping_size != previous_size || exec != prev_exec) {
			print_mapping(start, addr, previous_size, prev_exec);
			start = addr;
		}

		rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
		if (rc)
			return rc;

		update_page_count(psize, 1);
	}

	print_mapping(start, addr, mapping_size, exec);
	return 0;
}
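
/*
 * Selection sketch (not in the original source): on each loop iteration,
 *
 *	addr 1G-aligned, gap >= 1G, 1G pages available  -> PUD_SIZE mapping
 *	addr 2M-aligned, gap >= 2M, 2M pages available  -> PMD_SIZE mapping
 *	otherwise                                       -> PAGE_SIZE mapping
 *
 * next_boundary() shrinks the gap at rodata/text boundaries so a single
 * large page never straddles a permission change, and print_mapping()
 * reports each run of same-sized mappings when the size or execute
 * permission changes.
 */
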
#ifdef CONFIG_KFENCE
static bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL;

static int __init parse_kfence_early_init(char *arg)
{
	int val;

	if (get_option(&arg, &val))
		kfence_early_init = !!val;
	return 0;
}
early_param("kfence.sample_interval", parse_kfence_early_init);

static inline phys_addr_t alloc_kfence_pool(void)
{
	phys_addr_t kfence_pool;

	/*
	 * TODO: Enabling KFENCE after boot depends on the ability to split
	 * page table mappings. As such support is not currently implemented
	 * for radix page tables, only support enabling KFENCE at system
	 * startup for now.
	 *
	 * After support for splitting mappings is available on radix,
	 * alloc_kfence_pool() & map_kfence_pool() can be dropped and
	 * the mapping for __kfence_pool memory can be split during
	 * arch_kfence_init_pool().
	 */
	if (!kfence_early_init)
		goto no_kfence;

	kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
	if (!kfence_pool)
		goto no_kfence;

	memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE);
	return kfence_pool;

no_kfence:
	disable_kfence();
	return 0;
}

static inline void map_kfence_pool(phys_addr_t kfence_pool)
{
	if (!kfence_pool)
		return;

	if (create_physical_mapping(kfence_pool, kfence_pool + KFENCE_POOL_SIZE,
				    -1, PAGE_KERNEL, PAGE_SIZE))
		goto err;

	memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
	__kfence_pool = __va(kfence_pool);
	return;

err:
	memblock_phys_free(kfence_pool, KFENCE_POOL_SIZE);
	disable_kfence();
}
#else
static inline phys_addr_t alloc_kfence_pool(void) { return 0; }
static inline void map_kfence_pool(phys_addr_t kfence_pool) { }
#endif
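
/*
 * Size note (not in the original source): map_kfence_pool() passes a
 * mapping_sz_limit of PAGE_SIZE so the pool is backed by base pages only.
 * KFENCE flips protections on individual pages, which a 2M or 1G leaf
 * mapping could not provide without first being split.
 */
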
static void __init radix_init_pgtable(void)
{
	phys_addr_t kfence_pool;
	unsigned long rts_field;
	phys_addr_t start, end;
	u64 i;

	/* We don't support slb for radix */
	slb_set_size(0);

	kfence_pool = alloc_kfence_pool();

	/*
	 * Create the linear mapping
	 */
	for_each_mem_range(i, &start, &end) {
		/*
		 * The memblock allocator is up at this point, so the
		 * page tables will be allocated within the range. No
		 * need for a node (which we don't have yet).
		 */
		if (end >= RADIX_VMALLOC_START) {
			pr_warn("Outside the supported range\n");
			continue;
		}

		WARN_ON(create_physical_mapping(start, end,
						-1, PAGE_KERNEL, ~0UL));
	}

	map_kfence_pool(kfence_pool);

	if (!cpu_has_feature(CPU_FTR_HVMODE) &&
	    cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
		/*
		 * Older versions of KVM on these machines prefer if the
		 * guest only uses the low 19 PID bits.
		 */
		mmu_pid_bits = 19;
	}
	mmu_base_pid = 1;

	/*
	 * Allocate Partition table and process table for the
	 * host.
	 */
	BUG_ON(PRTB_SIZE_SHIFT > 36);
	process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
	/*
	 * Fill in the process table.
	 */
	rts_field = radix__get_tree_size();
	process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);

	/*
	 * The init_mm context is given the first available (non-zero) PID,
	 * which is the "guard PID" and contains no page table. PIDR should
	 * never be set to zero because that duplicates the kernel address
	 * space at the 0x0... offset (quadrant 0)!
	 *
	 * An arbitrary PID that may later be allocated by the PID allocator
	 * for userspace processes must not be used either, because that
	 * would cause stale user mappings for that PID on CPUs outside of
	 * the TLB invalidation scheme (because it won't be in mm_cpumask).
	 *
	 * So permanently carve out one PID for the purpose of a guard PID.
	 */
	init_mm.context.id = mmu_base_pid;
	mmu_base_pid++;
}

static void __init radix_init_partition_table(void)
{
	unsigned long rts_field, dw0, dw1;

	mmu_partition_table_init();
	rts_field = radix__get_tree_size();
	dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
	dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
	mmu_partition_table_set_entry(0, dw0, dw1, false);

	pr_info("Initializing Radix MMU\n");
}

static int __init get_idx_from_shift(unsigned int shift)
{
	int idx = -1;

	switch (shift) {
	case 0xc:
		idx = MMU_PAGE_4K;
		break;
	case 0x10:
		idx = MMU_PAGE_64K;
		break;
	case 0x15:
		idx = MMU_PAGE_2M;
		break;
	case 0x1e:
		idx = MMU_PAGE_1G;
		break;
	}
	return idx;
}
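
/*
 * Decode note (not in the original source): the hex case labels above are
 * page shifts, i.e. 0xc = 12 (4K), 0x10 = 16 (64K), 0x15 = 21 (2M) and
 * 0x1e = 30 (1G).
 */
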
static int __init radix_dt_scan_page_sizes(unsigned long node,
					   const char *uname, int depth,
					   void *data)
{
	int size = 0;
	int shift, idx;
	unsigned int ap;
	const __be32 *prop;
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);

	/* We are scanning "cpu" nodes only */
	if (type == NULL || strcmp(type, "cpu") != 0)
		return 0;

	/* Grab page size encodings */
	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
	if (!prop)
		return 0;

	pr_info("Page sizes from device-tree:\n");
	for (; size >= 4; size -= 4, ++prop) {
		struct mmu_psize_def *def;

		/* top 3 bits are the AP encoding */
		shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
		ap = be32_to_cpu(prop[0]) >> 29;
		pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
		idx = get_idx_from_shift(shift);
		if (idx < 0)
			continue;

		def = &mmu_psize_defs[idx];
		def->shift = shift;
		def->ap = ap;
		def->h_rpt_pgsize = psize_to_rpti_pgsize(idx);
	}

	/* needed ? */
	cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
	return 1;
}
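
/*
 * Worked example (not in the original source): a hypothetical cell value of
 * 0xa0000010 in "ibm,processor-radix-AP-encodings" decodes as
 *
 *	ap    = 0xa0000010 >> 29          = 0x5
 *	shift = 0xa0000010 & ~(0xe << 28) = 0x10 = 16   -> MMU_PAGE_64K
 *
 * i.e. a 64K page size with actual-page-size (AP) encoding 5.
 */
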
void __init radix__early_init_devtree(void)
{
	int rc;

	/*
	 * Try to find the available page sizes in the device-tree
	 */
	rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
	if (!rc) {
		/*
		 * No page size details found in device tree.
		 * Let's assume we have page 4k and 64k support
		 */
		mmu_psize_defs[MMU_PAGE_4K].shift = 12;
		mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
		mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize =
			psize_to_rpti_pgsize(MMU_PAGE_4K);

		mmu_psize_defs[MMU_PAGE_64K].shift = 16;
		mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
		mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
			psize_to_rpti_pgsize(MMU_PAGE_64K);
	}
}

void __init radix__early_init_mmu(void)
{
	unsigned long lpcr;

#ifdef CONFIG_PPC_64S_HASH_MMU
#ifdef CONFIG_PPC_64K_PAGES
	/* PAGE_SIZE mappings */
	mmu_virtual_psize = MMU_PAGE_64K;
#else
	mmu_virtual_psize = MMU_PAGE_4K;
#endif
#endif
	/*
	 * initialize page table size
	 */
	__pte_index_size = RADIX_PTE_INDEX_SIZE;
	__pmd_index_size = RADIX_PMD_INDEX_SIZE;
	__pud_index_size = RADIX_PUD_INDEX_SIZE;
	__pgd_index_size = RADIX_PGD_INDEX_SIZE;
	__pud_cache_index = RADIX_PUD_INDEX_SIZE;
	__pte_table_size = RADIX_PTE_TABLE_SIZE;
	__pmd_table_size = RADIX_PMD_TABLE_SIZE;
	__pud_table_size = RADIX_PUD_TABLE_SIZE;
	__pgd_table_size = RADIX_PGD_TABLE_SIZE;
	__pmd_val_bits = RADIX_PMD_VAL_BITS;
	__pud_val_bits = RADIX_PUD_VAL_BITS;
	__pgd_val_bits = RADIX_PGD_VAL_BITS;

	__kernel_virt_start = RADIX_KERN_VIRT_START;
	__vmalloc_start = RADIX_VMALLOC_START;
	__vmalloc_end = RADIX_VMALLOC_END;
	__kernel_io_start = RADIX_KERN_IO_START;
	__kernel_io_end = RADIX_KERN_IO_END;
	vmemmap = (struct page *)RADIX_VMEMMAP_START;
	ioremap_bot = IOREMAP_BASE;

#ifdef CONFIG_PCI
	pci_io_base = ISA_IO_BASE;
#endif
	__pte_frag_nr = RADIX_PTE_FRAG_NR;
	__pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
	__pmd_frag_nr = RADIX_PMD_FRAG_NR;
	__pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;

	radix_init_pgtable();

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
		radix_init_partition_table();
	} else {
		radix_init_pseries();
	}

	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);

	/* Switch to the guard PID before turning on MMU */
	radix__switch_mmu_context(NULL, &init_mm);
	tlbiel_all();
}
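
/*
 * Register note (not in the original source): LPCR_UPRT selects
 * process-table translation and LPCR_HR selects host radix mode. Bare-metal
 * kernels set both here and again on secondaries below, while LPAR guests
 * defer the partition table setup to the hypervisor via radix_init_pseries().
 */
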
void radix__early_init_mmu_secondary(void)
{
	unsigned long lpcr;

	/*
	 * update partition table control register and UPRT
	 */
	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);

		set_ptcr_when_no_uv(__pa(partition_tb) |
				    (PATB_SIZE_SHIFT - 12));
	}

	radix__switch_mmu_context(NULL, &init_mm);
	tlbiel_all();

	/* Make sure userspace can't change the AMR */
	mtspr(SPRN_UAMOR, 0);
}

/* Called during kexec sequence with MMU off */
notrace void radix__mmu_cleanup_all(void)
{
	unsigned long lpcr;

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
		set_ptcr_when_no_uv(0);
		powernv_set_nmmu_ptcr(0);
		radix__flush_tlb_all();
	}
}

#ifdef CONFIG_MEMORY_HOTPLUG
static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	pte_t *pte;
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = pte_start + i;
		if (!pte_none(*pte))
			return;
	}

	pte_free_kernel(&init_mm, pte_start);
	pmd_clear(pmd);
}

static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	pmd_t *pmd;
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd = pmd_start + i;
		if (!pmd_none(*pmd))
			return;
	}

	pmd_free(&init_mm, pmd_start);
	pud_clear(pud);
}

static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
{
	pud_t *pud;
	int i;

	for (i = 0; i < PTRS_PER_PUD; i++) {
		pud = pud_start + i;
		if (!pud_none(*pud))
			return;
	}

	pud_free(&init_mm, pud_start);
	p4d_clear(p4d);
}
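
/*
 * Pattern note (not in the original source): each free_*_table() helper
 * frees a lower-level table only if every one of its entries is none, then
 * clears the upper-level entry that pointed at it. Partially populated
 * tables are left alone, so unrelated mappings sharing the same table
 * survive a hot-remove.
 */
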
#ifdef CONFIG_SPARSEMEM_VMEMMAP
static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
{
	unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);

	return !vmemmap_populated(start, PMD_SIZE);
}

static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end)
{
	unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);

	return !vmemmap_populated(start, PAGE_SIZE);
}
#endif

static void __meminit free_vmemmap_pages(struct page *page,
					 struct vmem_altmap *altmap,
					 int order)
{
	unsigned int nr_pages = 1 << order;

	if (altmap) {
		unsigned long alt_start, alt_end;
		unsigned long base_pfn = page_to_pfn(page);

		/*
		 * With 2M vmemmap mappings things can be set up
		 * such that even though an altmap is specified we
		 * never actually used it for this page.
		 */
		alt_start = altmap->base_pfn;
		alt_end = altmap->base_pfn + altmap->reserve + altmap->free;

		if (base_pfn >= alt_start && base_pfn < alt_end) {
			vmem_altmap_free(altmap, nr_pages);
			return;
		}
	}

	if (PageReserved(page)) {
		/* allocated from memblock */
		while (nr_pages--)
			free_reserved_page(page++);
	} else
		free_pages((unsigned long)page_address(page), order);
}

static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr,
				       unsigned long end, bool direct,
				       struct vmem_altmap *altmap)
{
	unsigned long next, pages = 0;
	pte_t *pte;

	pte = pte_start + pte_index(addr);
	for (; addr < end; addr = next, pte++) {
		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
			next = end;

		if (!pte_present(*pte))
			continue;

		if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
			if (!direct)
				free_vmemmap_pages(pte_page(*pte), altmap, 0);
			pte_clear(&init_mm, addr, pte);
			pages++;
		}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
		else if (!direct && vmemmap_page_is_unused(addr, next)) {
			free_vmemmap_pages(pte_page(*pte), altmap, 0);
			pte_clear(&init_mm, addr, pte);
		}
#endif
	}
	if (direct)
		update_page_count(mmu_virtual_psize, -pages);
}

static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
				       unsigned long end, bool direct,
				       struct vmem_altmap *altmap)
{
	unsigned long next, pages = 0;
	pte_t *pte_base;
	pmd_t *pmd;

	pmd = pmd_start + pmd_index(addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);

		if (!pmd_present(*pmd))
			continue;

		if (pmd_leaf(*pmd)) {
			if (IS_ALIGNED(addr, PMD_SIZE) &&
			    IS_ALIGNED(next, PMD_SIZE)) {
				if (!direct)
					free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
				pte_clear(&init_mm, addr, (pte_t *)pmd);
				pages++;
			}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
			else if (!direct && vmemmap_pmd_is_unused(addr, next)) {
				free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
				pte_clear(&init_mm, addr, (pte_t *)pmd);
			}
#endif
			continue;
		}

		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
		remove_pte_table(pte_base, addr, next, direct, altmap);
		free_pte_table(pte_base, pmd);
	}
	if (direct)
		update_page_count(MMU_PAGE_2M, -pages);
}

static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
				       unsigned long end, bool direct,
				       struct vmem_altmap *altmap)
{
	unsigned long next, pages = 0;
	pmd_t *pmd_base;
	pud_t *pud;

	pud = pud_start + pud_index(addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);

		if (!pud_present(*pud))
			continue;

		if (pud_leaf(*pud)) {
			if (!IS_ALIGNED(addr, PUD_SIZE) ||
			    !IS_ALIGNED(next, PUD_SIZE)) {
				WARN_ONCE(1, "%s: unaligned range\n", __func__);
				continue;
			}
			pte_clear(&init_mm, addr, (pte_t *)pud);
			pages++;
			continue;
		}

		pmd_base = pud_pgtable(*pud);
		remove_pmd_table(pmd_base, addr, next, direct, altmap);
		free_pmd_table(pmd_base, pud);
	}
	if (direct)
		update_page_count(MMU_PAGE_1G, -pages);
}

static void __meminit
remove_pagetable(unsigned long start, unsigned long end, bool direct,
		 struct vmem_altmap *altmap)
{
	unsigned long addr, next;
	pud_t *pud_base;
	pgd_t *pgd;
	p4d_t *p4d;

	spin_lock(&init_mm.page_table_lock);

	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);

		pgd = pgd_offset_k(addr);
		p4d = p4d_offset(pgd, addr);
		if (!p4d_present(*p4d))
			continue;

		if (p4d_leaf(*p4d)) {
			if (!IS_ALIGNED(addr, P4D_SIZE) ||
			    !IS_ALIGNED(next, P4D_SIZE)) {
				WARN_ONCE(1, "%s: unaligned range\n", __func__);
				continue;
			}
			pte_clear(&init_mm, addr, (pte_t *)pgd);
			continue;
		}

		pud_base = p4d_pgtable(*p4d);
		remove_pud_table(pud_base, addr, next, direct, altmap);
		free_pud_table(pud_base, p4d);
	}

	spin_unlock(&init_mm.page_table_lock);
	radix__flush_tlb_kernel_range(start, end);
}

int __meminit radix__create_section_mapping(unsigned long start,
					    unsigned long end, int nid,
					    pgprot_t prot)
{
	if (end >= RADIX_VMALLOC_START) {
		pr_warn("Outside the supported range\n");
		return -1;
	}

	return create_physical_mapping(__pa(start), __pa(end),
				       nid, prot, ~0UL);
}

int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
{
	remove_pagetable(start, end, true, NULL);
	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
				 pgprot_t flags, unsigned int map_page_size,
				 int nid)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
}

int __meminit radix__vmemmap_create_mapping(unsigned long start,
					    unsigned long page_size,
					    unsigned long phys)
{
	/* Create a PTE encoding */
	int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
	int ret;

	if ((start + page_size) >= RADIX_VMEMMAP_END) {
		pr_warn("Outside the supported range\n");
		return -1;
	}

	ret = __map_kernel_page_nid(start, phys, PAGE_KERNEL, page_size, nid);
	BUG_ON(ret);

	return 0;
}

bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
{
	if (radix_enabled())
		return __vmemmap_can_optimize(altmap, pgmap);

	return false;
}

int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
				unsigned long addr, unsigned long next)
{
	int large = pmd_leaf(*pmdp);

	if (large)
		vmemmap_verify(pmdp_ptep(pmdp), node, addr, next);

	return large;
}

void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
			       unsigned long addr, unsigned long next)
{
	pte_t entry;
	pte_t *ptep = pmdp_ptep(pmdp);

	VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE));
	entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
	set_pte_at(&init_mm, addr, ptep, entry);
	asm volatile("ptesync": : :"memory");

	vmemmap_verify(ptep, node, addr, next);
}

static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr,
						     int node,
						     struct vmem_altmap *altmap,
						     struct page *reuse)
{
	pte_t *pte = pte_offset_kernel(pmdp, addr);

	if (pte_none(*pte)) {
		pte_t entry;
		void *p;

		if (!reuse) {
			/*
			 * Make sure we don't create altmap mappings
			 * covering things outside the device.
			 */
			if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
				altmap = NULL;

			p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
			if (!p && altmap)
				p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
			if (!p)
				return NULL;
			pr_debug("PAGE_SIZE vmemmap mapping\n");
		} else {
			/*
			 * When a PTE/PMD entry is freed from the init_mm
			 * there's a free_pages() call to this page allocated
			 * above. Thus this get_page() is paired with the
			 * put_page_testzero() on the freeing path.
			 * This can only be called by certain ZONE_DEVICE
			 * paths, and through vmemmap_populate_compound_pages()
			 * when slab is available.
			 */
			get_page(reuse);
			p = page_to_virt(reuse);
			pr_debug("Tail page reuse vmemmap mapping\n");
		}

		VM_BUG_ON(!PAGE_ALIGNED(addr));
		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
		set_pte_at(&init_mm, addr, pte, entry);
		asm volatile("ptesync": : :"memory");
	}
	return pte;
}

static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node,
				       unsigned long address)
{
	pud_t *pud;

	/* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
	if (unlikely(p4d_none(*p4dp))) {
		if (unlikely(!slab_is_available())) {
			pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
			p4d_populate(&init_mm, p4dp, pud);
			/* go to the pud_offset */
		} else
			return pud_alloc(&init_mm, p4dp, address);
	}
	return pud_offset(p4dp, address);
}

static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node,
				       unsigned long address)
{
	pmd_t *pmd;

	/* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
	if (unlikely(pud_none(*pudp))) {
		if (unlikely(!slab_is_available())) {
			pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
			pud_populate(&init_mm, pudp, pmd);
		} else
			return pmd_alloc(&init_mm, pudp, address);
	}
	return pmd_offset(pudp, address);
}

static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node,
				       unsigned long address)
{
	pte_t *pte;

	/* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
	if (unlikely(pmd_none(*pmdp))) {
		if (unlikely(!slab_is_available())) {
			pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
			pmd_populate(&init_mm, pmdp, pte);
		} else
			return pte_alloc_kernel(pmdp, address);
	}
	return pte_offset_kernel(pmdp, address);
}
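
/*
 * Pattern note (not in the original source): the three vmemmap_*_alloc()
 * helpers mirror pud_alloc()/pmd_alloc()/pte_alloc_kernel() but fall back to
 * early_alloc_pgtable() (memblock) when called before the slab allocator is
 * up, so the same populate path works both at early boot and at memory
 * hotplug time.
 */
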

int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node,
				      struct vmem_altmap *altmap)
{
	unsigned long addr;
	unsigned long next;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	for (addr = start; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);

		pgd = pgd_offset_k(addr);
		p4d = p4d_offset(pgd, addr);
		pud = vmemmap_pud_alloc(p4d, node, addr);
		if (!pud)
			return -ENOMEM;
		pmd = vmemmap_pmd_alloc(pud, node, addr);
		if (!pmd)
			return -ENOMEM;

		if (pmd_none(READ_ONCE(*pmd))) {
			void *p;

			/*
			 * Keep it simple by checking addr PMD_SIZE alignment
			 * and verifying the device boundary condition.
			 * For us to use a pmd mapping, both addr and pfn should
			 * be aligned. We skip if addr is not aligned and for
			 * pfn we hope we have extra area in the altmap that
			 * can help to find an aligned block. This can result
			 * in altmap block allocation failures, in which case
			 * we fall back to RAM for the vmemmap allocation.
			 */
			if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) ||
				       altmap_cross_boundary(altmap, addr, PMD_SIZE))) {
				/*
				 * Make sure we don't create altmap mappings
				 * covering things outside the device.
				 */
				goto base_mapping;
			}

			p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
			if (p) {
				vmemmap_set_pmd(pmd, p, node, addr, next);
				pr_debug("PMD_SIZE vmemmap mapping\n");
				continue;
			} else if (altmap) {
				/*
				 * A vmemmap block allocation can fail due to
				 * alignment requirements: by trying to align
				 * things aggressively we can run out of altmap
				 * space. Try a base mapping on failure.
				 */
				goto base_mapping;
			}
		} else if (vmemmap_check_pmd(pmd, node, addr, next)) {
			/*
			 * If a huge mapping exists due to an early call to
			 * vmemmap_populate, let's try to use that.
			 */
			continue;
		}
base_mapping:
		/*
		 * Either we were not able to allocate higher-order memory to
		 * back the memmap, or we found a pointer to a pte page.
		 * Allocate a base-page-size vmemmap.
		 */
		pte = vmemmap_pte_alloc(pmd, node, addr);
		if (!pte)
			return -ENOMEM;

		pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL);
		if (!pte)
			return -ENOMEM;

		vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
		next = addr + PAGE_SIZE;
	}
	return 0;
}
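
/*
 * Sizing note (not in the original source), assuming 64K base pages and a
 * 64-byte struct page: one PMD_SIZE (2M) vmemmap block holds
 * 2M / 64 = 32768 struct pages, which describe 32768 * 64K = 2G of memory,
 * so PMD-level vmemmap mappings keep the page-table overhead of the memmap
 * very small.
 */
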

static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node,
							 struct vmem_altmap *altmap,
							 struct page *reuse)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(addr);
	p4d = p4d_offset(pgd, addr);
	pud = vmemmap_pud_alloc(p4d, node, addr);
	if (!pud)
		return NULL;
	pmd = vmemmap_pmd_alloc(pud, node, addr);
	if (!pmd)
		return NULL;
	if (pmd_leaf(*pmd))
		/*
		 * The second page is mapped as a hugepage due to a nearby request.
		 * Force our mapping to page size without deduplication
		 */
		return NULL;
	pte = vmemmap_pte_alloc(pmd, node, addr);
	if (!pte)
		return NULL;

	radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
	vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

	return pte;
}

static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr,
						    unsigned long pfn_offset, int node)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	unsigned long map_addr;

	/* the second vmemmap page which we use for duplication */
	map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;
	pgd = pgd_offset_k(map_addr);
	p4d = p4d_offset(pgd, map_addr);
	pud = vmemmap_pud_alloc(p4d, node, map_addr);
	if (!pud)
		return NULL;
	pmd = vmemmap_pmd_alloc(pud, node, map_addr);
	if (!pmd)
		return NULL;
	if (pmd_leaf(*pmd))
		/*
		 * The second page is mapped as a hugepage due to a nearby request.
		 * Force our mapping to page size without deduplication
		 */
		return NULL;
	pte = vmemmap_pte_alloc(pmd, node, map_addr);
	if (!pte)
		return NULL;
	/*
	 * Check if there exists a mapping to the left
	 */
	if (pte_none(*pte)) {
		/*
		 * Populate the head page vmemmap page.
		 * It can fall in a different pmd, hence
		 * vmemmap_populate_address()
		 */
		pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL);
		if (!pte)
			return NULL;
		/*
		 * Populate the tail pages vmemmap page
		 */
		pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL);
		if (!pte)
			return NULL;
		vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE);
		return pte;
	}
	return pte;
}
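
/*
 * Layout sketch (not in the original source): for a compound device page,
 * only the first two vmemmap pages need unique backing; every later tail
 * vmemmap page can map the same physical page as the second one, because
 * all tail struct pages are identical. Roughly:
 *
 *	vmemmap page 0:    head struct page        (unique backing)
 *	vmemmap page 1:    first tail struct pages (unique backing)
 *	vmemmap page 2..N: remapped to page 1 via the reuse/get_page() path
 */
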

int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
					      unsigned long start,
					      unsigned long end, int node,
					      struct dev_pagemap *pgmap)
{
	/*
	 * We want to map things with base page size mappings so that
	 * we can save space in the vmemmap; a huge mapping could cover
	 * both edges of the range.
	 */
	unsigned long addr;
	unsigned long addr_pfn = start_pfn;
	unsigned long next;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	for (addr = start; addr < end; addr = next) {
		pgd = pgd_offset_k(addr);
		p4d = p4d_offset(pgd, addr);
		pud = vmemmap_pud_alloc(p4d, node, addr);
		if (!pud)
			return -ENOMEM;
		pmd = vmemmap_pmd_alloc(pud, node, addr);
		if (!pmd)
			return -ENOMEM;

		if (pmd_leaf(READ_ONCE(*pmd))) {
			/* existing huge mapping. Skip the range */
			addr_pfn += (PMD_SIZE >> PAGE_SHIFT);
			next = pmd_addr_end(addr, end);
			continue;
		}

		pte = vmemmap_pte_alloc(pmd, node, addr);
		if (!pte)
			return -ENOMEM;
		if (!pte_none(*pte)) {
			/*
			 * This could be because we already have a compound
			 * page whose VMEMMAP_RESERVE_NR pages were mapped and
			 * this request falls within those pages.
			 */
			addr_pfn += 1;
			next = addr + PAGE_SIZE;
			continue;
		} else {
			unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
			unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
			pte_t *tail_page_pte;

			/*
			 * If the address is aligned to the huge page size it
			 * is the head mapping.
			 */
			if (pfn_offset == 0) {
				/* Populate the head page vmemmap page */
				pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
				if (!pte)
					return -ENOMEM;
				vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

				/*
				 * Populate the tail pages vmemmap page.
				 * It can fall in a different pmd, hence
				 * vmemmap_populate_address()
				 */
				pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
				if (!pte)
					return -ENOMEM;

				addr_pfn += 2;
				next = addr + 2 * PAGE_SIZE;
				continue;
			}
			/*
			 * Get the 2nd mapping details.
			 * Also create it if that doesn't exist.
			 */
			tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node);
			if (!tail_page_pte) {
				pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
				if (!pte)
					return -ENOMEM;
				vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

				addr_pfn += 1;
				next = addr + PAGE_SIZE;
				continue;
			}

			pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
			if (!pte)
				return -ENOMEM;
			vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

			addr_pfn += 1;
			next = addr + PAGE_SIZE;
			continue;
		}
	}
	return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
	remove_pagetable(start, start + page_size, true, NULL);
}

void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
			       struct vmem_altmap *altmap)
{
	remove_pagetable(start, end, false, altmap);
}
#endif
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
					 pmd_t *pmdp, unsigned long clr,
					 unsigned long set)
{
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
	assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif

	old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1);
	trace_hugepage_update_pmd(addr, old, clr, set);

	return old;
}

unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
					 pud_t *pudp, unsigned long clr,
					 unsigned long set)
{
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pud_devmap(*pudp));
	assert_spin_locked(pud_lockptr(mm, pudp));
#endif

	old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
	trace_hugepage_update_pud(addr, old, clr, set);

	return old;
}

pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
				 pmd_t *pmdp)
{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
	VM_BUG_ON(pmd_devmap(*pmdp));
	/*
	 * khugepaged calls this for normal pmd
	 */
	pmd = *pmdp;
	pmd_clear(pmdp);

	radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);

	return pmd;
}

/*
 * For us pgtable_t is pte_t *. In order to save the deposited
 * page table, we consider the allocated page table as a list
 * head. On withdraw we need to make sure we zero out the used
 * list_head memory area.
 */
void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				       pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pte_t *ptep;
	pgtable_t pgtable;
	struct list_head *lh;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	*ptep = __pte(0);
	ptep++;
	*ptep = __pte(0);
	return pgtable;
}
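
/*
 * Mechanism sketch (not in the original source): the deposited page table
 * pages form an intrusive list threaded through their own storage. A
 * list_head is two pointers, which is why the withdraw path zeroes exactly
 * two pte_t slots before handing the table back: those slots were
 * overwritten while the page sat on the deposit list.
 */
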

pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
				     unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	unsigned long old;

	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	return old_pmd;
}

pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
				     unsigned long addr, pud_t *pudp)
{
	pud_t old_pud;
	unsigned long old;

	old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
	old_pud = __pud(old);
	return old_pud;
}

#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
				  pte_t entry, unsigned long address, int psize)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_SOFT_DIRTY |
					      _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);

	unsigned long change = pte_val(entry) ^ pte_val(*ptep);
	/*
	 * On POWER9, the NMMU is not able to relax PTE access permissions
	 * for a translation with a TLB. The PTE must be invalidated and the
	 * TLB flushed before the new PTE is installed.
	 *
	 * This only needs to be done for radix, because hash translation does
	 * flush when updating the linux pte (and we don't support NMMU
	 * accelerators on HPT on POWER9 anyway XXX: do we?).
	 *
	 * POWER10 (and P9P) NMMU does behave as per ISA.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) &&
	    atomic_read(&mm->context.copros) > 0) {
		unsigned long old_pte, new_pte;

		old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
		new_pte = old_pte | set;
		radix__flush_tlb_page_psize(mm, address, psize);
		__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
	} else {
		__radix_pte_update(ptep, 0, set);

		/*
		 * Book3S does not require a TLB flush when relaxing access
		 * restrictions (modulo the POWER9 nest MMU issue above),
		 * because the MMU will reload the PTE after taking an access
		 * fault, as defined by the architecture. See "Setting a
		 * Reference or Change Bit or Upgrading Access Authority
		 * (PTE Subject to Atomic Hardware Updates)" in Power ISA
		 * Version 3.1B.
		 */
	}
	/* See ptesync comment in radix__set_pte_at */
}

void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
				    unsigned long addr, pte_t *ptep,
				    pte_t old_pte, pte_t pte)
{
	struct mm_struct *mm = vma->vm_mm;

	/*
	 * POWER9 NMMU must flush the TLB after clearing the PTE before
	 * installing a PTE with more relaxed access permissions, see
	 * radix__ptep_set_access_flags.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
	    is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
	    (atomic_read(&mm->context.copros) > 0))
		radix__flush_tlb_page(vma, addr);

	set_pte_at(mm, addr, ptep, pte);
}

int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
	pte_t *ptep = (pte_t *)pud;
	pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot);

	if (!radix_enabled())
		return 0;

	set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud);

	return 1;
}

int pud_clear_huge(pud_t *pud)
{
	if (pud_leaf(*pud)) {
		pud_clear(pud);
		return 1;
	}

	return 0;
}

int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
	pmd_t *pmd;
	int i;

	pmd = pud_pgtable(*pud);
	pud_clear(pud);

	flush_tlb_kernel_range(addr, addr + PUD_SIZE);

	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(pmd[i])) {
			pte_t *pte;

			pte = (pte_t *)pmd_page_vaddr(pmd[i]);
			pte_free_kernel(&init_mm, pte);
		}
	}

	pmd_free(&init_mm, pmd);

	return 1;
}

int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
	pte_t *ptep = (pte_t *)pmd;
	pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot);

	if (!radix_enabled())
		return 0;

	set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd);

	return 1;
}

int pmd_clear_huge(pmd_t *pmd)
{
	if (pmd_leaf(*pmd)) {
		pmd_clear(pmd);
		return 1;
	}

	return 0;
}

int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	pte_t *pte;

	pte = (pte_t *)pmd_page_vaddr(*pmd);
	pmd_clear(pmd);

	flush_tlb_kernel_range(addr, addr + PMD_SIZE);

	pte_free_kernel(&init_mm, pte);

	return 1;
}
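
/*
 * Usage note (not in the original source): pud_set_huge()/pmd_set_huge()
 * and their clear/free counterparts are the generic hooks the kernel's
 * vmap/ioremap code uses to install and tear down huge leaf mappings; on
 * non-radix (hash) MMUs they return 0, so callers fall back to mapping with
 * smaller pages.
 */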