trans_pgd.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Transitional page tables for kexec and hibernate
 *
 * This file derived from: arch/arm64/kernel/hibernate.c
 *
 * Copyright (c) 2021, Microsoft Corporation.
 * Pasha Tatashin <pasha.tatashin@soleen.com>
 *
 */

/*
 * Transitional tables are used while the system transfers from one world to
 * another: for example during hibernate restore and kexec reboots. During
 * these phases one cannot rely on the current page tables not being
 * overwritten, because hibernate and kexec may overwrite them mid-transition.
 */

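/*
 * Page-table pages are obtained through the trans_alloc_page()/trans_alloc_arg
 * callback carried in struct trans_pgd_info (asm/trans_pgd.h), since each
 * caller has its own idea of which pages are safe to use during the
 * transition: hibernate, for example, allocates from pages known not to
 * collide with the image being restored. The callback is expected to hand
 * back a single zeroed page, or NULL on failure.
 */
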
#include <asm/trans_pgd.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <linux/suspend.h>
#include <linux/bug.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/kfence.h>

static void *trans_alloc(struct trans_pgd_info *info)
{
	return info->trans_alloc_page(info->trans_alloc_arg);
}

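/*
 * Copy a single source PTE into the transitional table, making the copy
 * writable (the restore path writes through these mappings) and, where the
 * source entry was deliberately invalidated but still points at real memory,
 * valid again.
 */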
static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr)
{
	pte_t pte = __ptep_get(src_ptep);

	if (pte_valid(pte)) {
		/*
		 * Resume will overwrite areas that may be marked
		 * read only (code, rodata). Clear the RDONLY bit from
		 * the temporary mappings we use during restore.
		 */
		__set_pte(dst_ptep, pte_mkwrite_novma(pte));
	} else if (!pte_none(pte)) {
		/*
		 * debug_pagealloc will have removed the PTE_VALID bit if
		 * the page isn't in use by the resume kernel. It may have
		 * been in use by the original kernel, in which case we need
		 * to put it back in our copy to do the restore.
		 *
		 * Other cases include kfence / vmalloc / memfd_secret which
		 * may call `set_direct_map_invalid_noflush()`.
		 *
		 * Before marking this entry valid, check that the pfn is
		 * backed by memory that should be mapped.
		 */
		BUG_ON(!pfn_valid(pte_pfn(pte)));

		__set_pte(dst_ptep, pte_mkpresent(pte_mkwrite_novma(pte)));
	}
}

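/*
 * copy_pte()/copy_pmd()/copy_pud()/copy_p4d() walk the source tables level by
 * level: at each level a destination table page is allocated when needed via
 * trans_alloc(), block entries are copied with their RDONLY attribute
 * cleared, and table entries recurse one level down.
 */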
static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp,
		    pmd_t *src_pmdp, unsigned long start, unsigned long end)
{
	pte_t *src_ptep;
	pte_t *dst_ptep;
	unsigned long addr = start;

	dst_ptep = trans_alloc(info);
	if (!dst_ptep)
		return -ENOMEM;
	pmd_populate_kernel(NULL, dst_pmdp, dst_ptep);
	dst_ptep = pte_offset_kernel(dst_pmdp, start);

	src_ptep = pte_offset_kernel(src_pmdp, start);
	do {
		_copy_pte(dst_ptep, src_ptep, addr);
	} while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end);

	return 0;
}

static int copy_pmd(struct trans_pgd_info *info, pud_t *dst_pudp,
		    pud_t *src_pudp, unsigned long start, unsigned long end)
{
	pmd_t *src_pmdp;
	pmd_t *dst_pmdp;
	unsigned long next;
	unsigned long addr = start;

	if (pud_none(READ_ONCE(*dst_pudp))) {
		dst_pmdp = trans_alloc(info);
		if (!dst_pmdp)
			return -ENOMEM;
		pud_populate(NULL, dst_pudp, dst_pmdp);
	}
	dst_pmdp = pmd_offset(dst_pudp, start);

	src_pmdp = pmd_offset(src_pudp, start);
	do {
		pmd_t pmd = READ_ONCE(*src_pmdp);

		next = pmd_addr_end(addr, end);
		if (pmd_none(pmd))
			continue;
		if (pmd_table(pmd)) {
			if (copy_pte(info, dst_pmdp, src_pmdp, addr, next))
				return -ENOMEM;
		} else {
			set_pmd(dst_pmdp,
				__pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY));
		}
	} while (dst_pmdp++, src_pmdp++, addr = next, addr != end);

	return 0;
}

static int copy_pud(struct trans_pgd_info *info, p4d_t *dst_p4dp,
		    p4d_t *src_p4dp, unsigned long start,
		    unsigned long end)
{
	pud_t *dst_pudp;
	pud_t *src_pudp;
	unsigned long next;
	unsigned long addr = start;

	if (p4d_none(READ_ONCE(*dst_p4dp))) {
		dst_pudp = trans_alloc(info);
		if (!dst_pudp)
			return -ENOMEM;
		p4d_populate(NULL, dst_p4dp, dst_pudp);
	}
	dst_pudp = pud_offset(dst_p4dp, start);

	src_pudp = pud_offset(src_p4dp, start);
	do {
		pud_t pud = READ_ONCE(*src_pudp);

		next = pud_addr_end(addr, end);
		if (pud_none(pud))
			continue;
		if (pud_table(pud)) {
			if (copy_pmd(info, dst_pudp, src_pudp, addr, next))
				return -ENOMEM;
		} else {
			set_pud(dst_pudp,
				__pud(pud_val(pud) & ~PUD_SECT_RDONLY));
		}
	} while (dst_pudp++, src_pudp++, addr = next, addr != end);

	return 0;
}

static int copy_p4d(struct trans_pgd_info *info, pgd_t *dst_pgdp,
		    pgd_t *src_pgdp, unsigned long start,
		    unsigned long end)
{
	p4d_t *dst_p4dp;
	p4d_t *src_p4dp;
	unsigned long next;
	unsigned long addr = start;

	dst_p4dp = p4d_offset(dst_pgdp, start);
	src_p4dp = p4d_offset(src_pgdp, start);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none(READ_ONCE(*src_p4dp)))
			continue;
		if (copy_pud(info, dst_p4dp, src_p4dp, addr, next))
			return -ENOMEM;
	} while (dst_p4dp++, src_p4dp++, addr = next, addr != end);

	return 0;
}

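/*
 * Top-level walk: start from the running kernel's page tables (via
 * pgd_offset_k()) and replicate every populated entry covering [start, end)
 * into the freshly allocated destination pgd.
 */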
static int copy_page_tables(struct trans_pgd_info *info, pgd_t *dst_pgdp,
			    unsigned long start, unsigned long end)
{
	unsigned long next;
	unsigned long addr = start;
	pgd_t *src_pgdp = pgd_offset_k(start);

	dst_pgdp = pgd_offset_pgd(dst_pgdp, start);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none(READ_ONCE(*src_pgdp)))
			continue;
		if (copy_p4d(info, dst_pgdp, src_pgdp, addr, next))
			return -ENOMEM;
	} while (dst_pgdp++, src_pgdp++, addr = next, addr != end);

	return 0;
}

/*
 * Create trans_pgd and copy linear map.
 * info:	contains allocator and its argument
 * dst_pgdp:	new page table that is created, and to which map is copied.
 * start:	Start of the interval (inclusive).
 * end:		End of the interval (exclusive).
 *
 * Returns 0 on success, and -ENOMEM on failure.
 */
int trans_pgd_create_copy(struct trans_pgd_info *info, pgd_t **dst_pgdp,
			  unsigned long start, unsigned long end)
{
	int rc;
	pgd_t *trans_pgd = trans_alloc(info);

	if (!trans_pgd) {
		pr_err("Failed to allocate memory for temporary page tables.\n");
		return -ENOMEM;
	}

	rc = copy_page_tables(info, trans_pgd, start, end);
	if (!rc)
		*dst_pgdp = trans_pgd;

	return rc;
}

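/*
 * A minimal sketch of how a caller can wire this up, modelled on the
 * hibernate code in arch/arm64/kernel/hibernate.c (names and GFP flags as
 * used there; the allocator must return pages that stay intact for the whole
 * transition, which get_safe_page() guarantees for hibernate):
 *
 *	static void *hibernate_page_alloc(void *arg)
 *	{
 *		return (void *)get_safe_page((__force gfp_t)(unsigned long)arg);
 *	}
 *
 *	struct trans_pgd_info trans_info = {
 *		.trans_alloc_page	= hibernate_page_alloc,
 *		.trans_alloc_arg	= (__force void *)GFP_ATOMIC,
 *	};
 *	pgd_t *tmp_pg_dir;
 *	int rc;
 *
 *	rc = trans_pgd_create_copy(&trans_info, &tmp_pg_dir,
 *				   PAGE_OFFSET, PAGE_END);
 *
 * Only the linear map is copied, which is all the restore code needs while it
 * overwrites the old kernel's memory.
 */
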
/*
 * The page we want to idmap may be outside the range covered by VA_BITS that
 * can be built using the kernel's p?d_populate() helpers. As a one-off, for a
 * single page, we build these page tables bottom up and just assume that we
 * will need the maximum T0SZ.
 *
 * Returns 0 on success, and -ENOMEM on failure.
 * On success trans_ttbr0 contains a page table with the idmapped page, and
 * t0sz is set to the maximum T0SZ for this page.
 */
int trans_pgd_idmap_page(struct trans_pgd_info *info, phys_addr_t *trans_ttbr0,
			 unsigned long *t0sz, void *page)
{
	phys_addr_t dst_addr = virt_to_phys(page);
	unsigned long pfn = __phys_to_pfn(dst_addr);
	int max_msb = (dst_addr & GENMASK(52, 48)) ? 51 : 47;
	int bits_mapped = PAGE_SHIFT - 4;
	unsigned long level_mask, prev_level_entry, *levels[4];
	int this_level, index, level_lsb, level_msb;

	dst_addr &= PAGE_MASK;
	prev_level_entry = pte_val(pfn_pte(pfn, PAGE_KERNEL_ROX));

	for (this_level = 3; this_level >= 0; this_level--) {
		levels[this_level] = trans_alloc(info);
		if (!levels[this_level])
			return -ENOMEM;

		level_lsb = ARM64_HW_PGTABLE_LEVEL_SHIFT(this_level);
		level_msb = min(level_lsb + bits_mapped, max_msb);
		level_mask = GENMASK_ULL(level_msb, level_lsb);

		index = (dst_addr & level_mask) >> level_lsb;
		*(levels[this_level] + index) = prev_level_entry;

		pfn = virt_to_pfn(levels[this_level]);
		prev_level_entry = pte_val(pfn_pte(pfn,
						   __pgprot(PMD_TYPE_TABLE)));

		if (level_msb == max_msb)
			break;
	}

	*trans_ttbr0 = phys_to_ttbr(__pfn_to_phys(pfn));
	*t0sz = TCR_T0SZ(max_msb + 1);

	return 0;
}

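/*
 * Worked example (illustrative, for 4K pages and a page below the 48-bit
 * boundary): ARM64_HW_PGTABLE_LEVEL_SHIFT() gives LSBs 12/21/30/39 for levels
 * 3/2/1/0, and each level indexes PAGE_SHIFT - 3 = 9 bits; bits_mapped is
 * PAGE_SHIFT - 4 because the GENMASK_ULL() bounds are inclusive. With
 * max_msb = 47 the loop builds the level 3, 2, 1 and 0 tables bottom up,
 * stops once bit 47 is covered, and TCR_T0SZ(48) encodes T0SZ = 64 - 48 = 16,
 * i.e. a 48-bit TTBR0 range.
 */
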
/*
 * Create a copy of the vector table so we can call HVC_SET_VECTORS or
 * HVC_SOFT_RESTART from contexts where the table may be overwritten.
 */
int trans_pgd_copy_el2_vectors(struct trans_pgd_info *info,
			       phys_addr_t *el2_vectors)
{
	void *hyp_stub = trans_alloc(info);

	if (!hyp_stub)
		return -ENOMEM;
	*el2_vectors = virt_to_phys(hyp_stub);
	memcpy(hyp_stub, &trans_pgd_stub_vectors, ARM64_VECTOR_TABLE_LEN);
	caches_clean_inval_pou((unsigned long)hyp_stub,
			       (unsigned long)hyp_stub + ARM64_VECTOR_TABLE_LEN);
	dcache_clean_inval_poc((unsigned long)hyp_stub,
			       (unsigned long)hyp_stub + ARM64_VECTOR_TABLE_LEN);

	return 0;
}