hugetlb_vmemmap.c

// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 *     Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)	"HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <linux/mmdebug.h>
#include <linux/pagewalk.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of PTEs walked.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
 *			or are mapped from.
 * @flags:		used to modify behavior in vmemmap page table walking
 *			operations.
 */
struct vmemmap_remap_walk {
	void			(*remap_pte)(pte_t *pte, unsigned long addr,
					     struct vmemmap_remap_walk *walk);
	unsigned long		nr_walked;
	struct page		*reuse_page;
	unsigned long		reuse_addr;
	struct list_head	*vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH	BIT(1)
/* synchronize_rcu() to avoid writes from page_ref_add_unless() */
#define VMEMMAP_SYNCHRONIZE_RCU		BIT(2)
	unsigned long		flags;
};

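/*
 * A worked example of the savings HVO targets, assuming x86-64 defaults
 * (4 KiB base pages and a 64-byte struct page; both are config dependent):
 *
 *	2 MiB hugepage:  512 struct pages = 32 KiB = 8 vmemmap pages.
 *			 HVO keeps one vmemmap page and remaps the other 7 to
 *			 it read-only, saving 28 KiB per hugepage.
 *	1 GiB hugepage:  262144 struct pages = 16 MiB = 4096 vmemmap pages.
 *			 HVO keeps one and frees 4095, saving almost 16 MiB.
 *
 * See Documentation/mm/vmemmap_dedup.rst for the full layout diagrams.
 */
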
static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	pte_t *pgtable;

	pgtable = pte_alloc_one_kernel(&init_mm);
	if (!pgtable)
		return -ENOMEM;

	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(head + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/*
		 * Higher order allocations from the buddy allocator must be
		 * able to be treated as independent small pages (as they can
		 * be freed individually).
		 */
		if (!PageReserved(head))
			split_page(head, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
			flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}

static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	int ret = 0;
	struct page *head;
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/* Only splitting, not remapping the vmemmap pages. */
	if (!vmemmap_walk->remap_pte)
		walk->action = ACTION_CONTINUE;

	spin_lock(&init_mm.page_table_lock);
	head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
	/*
	 * Due to HugeTLB alignment requirements and the vmemmap pages being
	 * at the start of the hotplugged memory region in the
	 * memory_hotplug.memmap_on_memory case, checking whether the first
	 * vmemmap page is self-hosted is sufficient.
	 *
	 * [                 hotplugged memory                 ]
	 * [    section    ][...][    section    ]
	 * [ vmemmap ][             usable memory              ]
	 *   ^  |                  ^                           |
	 *   +--+                  |                           |
	 *                         +---------------------------+
	 */
	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
		struct page *page = head ? head + pte_index(addr) :
				    pte_page(ptep_get(pte_offset_kernel(pmd, addr)));

		if (PageVmemmapSelfHosted(page))
			ret = -ENOTSUPP;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (!head || ret)
		return ret;

	return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
}

static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/*
	 * The reuse_page is found 'first' in the page table walk, before
	 * the remapping starts.
	 */
	if (!vmemmap_walk->reuse_page)
		vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
	else
		vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
	vmemmap_walk->nr_walked++;

	return 0;
}

static const struct mm_walk_ops vmemmap_remap_ops = {
	.pmd_entry	= vmemmap_pmd_entry,
	.pte_entry	= vmemmap_pte_entry,
};

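/*
 * Walk the kernel vmemmap range [@start, @end) under init_mm's mmap read
 * lock, splitting PMDs and/or remapping PTEs via @walk's callbacks. Unless
 * the caller deferred it (VMEMMAP_REMAP_NO_TLB_FLUSH), the remapped range
 * is TLB-flushed before returning.
 */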
static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	int ret;

	VM_BUG_ON(!PAGE_ALIGNED(start | end));

	mmap_read_lock(&init_mm);
	ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
				    NULL, walk);
	mmap_read_unlock(&init_mm);
	if (ret)
		return ret;

	if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
		flush_tlb_kernel_range(start, end);

	return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, it means
 * that it was allocated from the memblock allocator; free it via
 * free_bootmem_page(). Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (PageReserved(page)) {
		memmap_boot_pages_add(-1);
		free_bootmem_page(page);
	} else {
		memmap_pages_add(-1);
		__free_page(page);
	}
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru)
		free_vmemmap_page(page);
}

static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	/*
	 * Remap the tail pages as read-only to catch illegal write operations
	 * to the tail pages.
	 */
	pgprot_t pgprot = PAGE_KERNEL_RO;
	struct page *page = pte_page(ptep_get(pte));
	pte_t entry;

	/* Remapping the head page requires r/w */
	if (unlikely(addr == walk->reuse_addr)) {
		pgprot = PAGE_KERNEL;
		list_del(&walk->reuse_page->lru);

		/*
		 * Makes sure that preceding stores to the page contents from
		 * vmemmap_remap_free() become visible before the set_pte_at()
		 * write.
		 */
		smp_wmb();
	}

	entry = mk_pte(walk->reuse_page, pgprot);
	list_add(&page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, entry);
}

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid value will
 * be checked in free_tail_page_prepare(). In order to avoid the message of
 * "corrupted mapping in tail page", we need to reset at least 3 struct page
 * structs (one head struct page and two tail struct pages).
 */
#define NR_RESET_STRUCT_PAGE	3

static inline void reset_struct_pages(struct page *start)
{
	struct page *from = start + NR_RESET_STRUCT_PAGE;

	BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
	memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}

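/*
 * Undo vmemmap_remap_pte() for one PTE: take a freshly allocated page off
 * walk->vmemmap_pages, copy the contents of the shared (reused) vmemmap page
 * into it, reset the first few struct pages so they do not inherit head-page
 * metadata, and point the PTE at the new page with r/w permissions.
 */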
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	pgprot_t pgprot = PAGE_KERNEL;
	struct page *page;
	void *to;

	BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);
	to = page_to_virt(page);
	copy_page(to, (void *)walk->reuse_addr);
	reset_struct_pages(to);

	/*
	 * Makes sure that preceding stores to the page contents become visible
	 * before the set_pte_at() write.
	 */
	smp_wmb();
	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
 *                       backing PMDs of the directmap into PTEs
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_split(unsigned long start, unsigned long end,
			       unsigned long reuse)
{
	struct vmemmap_remap_walk walk = {
		.remap_pte	= NULL,
		.flags		= VMEMMAP_SPLIT_NO_TLB_FLUSH,
	};

	/* See the comment in the vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	return vmemmap_remap_range(reuse, end, &walk);
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *			to the page which @reuse is mapped to, then free the
 *			vmemmap pages which the range was mapped to.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @vmemmap_pages:	list to deposit vmemmap pages to be freed. It is the
 *			caller's responsibility to free the pages.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
			      unsigned long reuse,
			      struct list_head *vmemmap_pages,
			      unsigned long flags)
{
	int ret;
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_remap_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= vmemmap_pages,
		.flags		= flags,
	};
	int nid = page_to_nid((struct page *)reuse);
	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;

	/*
	 * Allocate a new head vmemmap page to avoid breaking a contiguous
	 * block of struct page memory when freeing it back to the page
	 * allocator in free_vmemmap_page_list(). This will allow the likely
	 * contiguous struct page backing memory to be kept contiguous,
	 * allowing for more allocations of hugepages. Fall back to the
	 * currently mapped head page should the allocation fail.
	 */
	walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
	if (walk.reuse_page) {
		copy_page(page_to_virt(walk.reuse_page),
			  (void *)walk.reuse_addr);
		list_add(&walk.reuse_page->lru, vmemmap_pages);
		memmap_pages_add(1);
	}

	/*
	 * In order to make the remapping routine most efficient for the huge
	 * pages, the routine of vmemmap page table walking has the following
	 * rules (see more details in vmemmap_pte_entry()):
	 *
	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
	 *   should be continuous.
	 * - The @reuse address is part of the range [@reuse, @end) that we are
	 *   walking which is passed to vmemmap_remap_range().
	 * - The @reuse address is the first in the complete range.
	 *
	 * So we need to make sure that @start and @reuse meet the above rules.
	 */
	BUG_ON(start - reuse != PAGE_SIZE);

	ret = vmemmap_remap_range(reuse, end, &walk);
	if (ret && walk.nr_walked) {
		end = reuse + walk.nr_walked * PAGE_SIZE;
		/*
		 * vmemmap_pages contains pages from the previous
		 * vmemmap_remap_range call which failed. These
		 * are pages which were removed from the vmemmap.
		 * They will be restored in the following call.
		 */
		walk = (struct vmemmap_remap_walk) {
			.remap_pte	= vmemmap_restore_pte,
			.reuse_addr	= reuse,
			.vmemmap_pages	= vmemmap_pages,
			.flags		= 0,
		};

		vmemmap_remap_range(reuse, end, &walk);
	}

	return ret;
}

static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   struct list_head *list)
{
	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;
	int i;

	for (i = 0; i < nr_pages; i++) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page)
			goto out;
		list_add(&page->lru, list);
	}
	memmap_pages_add(nr_pages);

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru)
		__free_page(page);
	return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *			 to freshly allocated pages, one for each vmemmap page
 *			 in the range.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
			       unsigned long reuse, unsigned long flags)
{
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_restore_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
		.flags		= flags,
	};

	/* See the comment in the vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
		return -ENOMEM;

	return vmemmap_remap_range(reuse, end, &walk);
}

DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);

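/*
 * Restore a previously HVO-optimized folio: allocate one fresh page per
 * vmemmap page in [vmemmap_start, vmemmap_end) and remap the range to them,
 * giving the folio a fully populated vmemmap again. On success the
 * vmemmap-optimized flag is cleared and the static key reference is dropped.
 */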
static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
					   struct folio *folio, unsigned long flags)
{
	int ret;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!folio_test_hugetlb_vmemmap_optimized(folio))
		return 0;

	if (flags & VMEMMAP_SYNCHRONIZE_RCU)
		synchronize_rcu();

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * The pages which the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
	 * When a HugeTLB page is freed to the buddy allocator, previously
	 * discarded vmemmap pages must be allocated and remapped.
	 */
	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
	if (!ret) {
		folio_clear_hugetlb_vmemmap_optimized(folio);
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 *				   hugetlb_vmemmap_optimize_folio()) vmemmap pages which
 *				   will be reallocated and remapped.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be restored.
 *
 * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
	return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU);
}

/**
 * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
 * @h:			hstate.
 * @folio_list:		list of folios.
 * @non_hvo_folios:	Output list of folios for which vmemmap exists.
 *
 * Return: number of folios for which vmemmap was restored, or an error code
 *		if an error was encountered restoring vmemmap for a folio.
 *		Folios that have vmemmap are moved to the non_hvo_folios
 *		list. Processing of entries stops when the first error is
 *		encountered. The folio that experienced the error and all
 *		non-processed folios will remain on folio_list.
 */
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
				    struct list_head *folio_list,
				    struct list_head *non_hvo_folios)
{
	struct folio *folio, *t_folio;
	long restored = 0;
	long ret = 0;
	unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;

	list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
			ret = __hugetlb_vmemmap_restore_folio(h, folio, flags);
			/* only need to synchronize_rcu() once for each batch */
			flags &= ~VMEMMAP_SYNCHRONIZE_RCU;

			if (ret)
				break;
			restored++;
		}

		/* Add non-optimized folios to output list */
		list_move(&folio->lru, non_hvo_folios);
	}

	if (restored)
		flush_tlb_all();
	if (!ret)
		ret = restored;
	return ret;
}

/* Return true iff a HugeTLB folio's vmemmap should and can be optimized. */
static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
{
	if (folio_test_hugetlb_vmemmap_optimized(folio))
		return false;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(h))
		return false;

	return true;
}

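/*
 * Optimize one folio's vmemmap: remap all but the reserved head portion of
 * its vmemmap range to the (read-only) reuse page and collect the now-unused
 * vmemmap pages on @vmemmap_pages for the caller to free. On failure the
 * folio is left unoptimized and the static key reference is dropped again.
 */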
static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
					    struct folio *folio,
					    struct list_head *vmemmap_pages,
					    unsigned long flags)
{
	int ret = 0;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!vmemmap_should_optimize_folio(h, folio))
		return ret;

	static_branch_inc(&hugetlb_optimize_vmemmap_key);

	if (flags & VMEMMAP_SYNCHRONIZE_RCU)
		synchronize_rcu();
	/*
	 * Very Subtle
	 * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
	 * immediately after remapping. As a result, subsequent accesses
	 * and modifications to struct pages associated with the hugetlb
	 * page could be to the OLD struct pages. Set the vmemmap optimized
	 * flag here so that it is copied to the new head page. This keeps
	 * the old and new struct pages in sync.
	 * If there is an error during optimization, we will immediately FLUSH
	 * the TLB and clear the flag below.
	 */
	folio_set_hugetlb_vmemmap_optimized(folio);

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
	 * to the page which @vmemmap_reuse is mapped to. Add pages previously
	 * mapping the range to vmemmap_pages list so that they can be freed by
	 * the caller.
	 */
	ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
				 vmemmap_pages, flags);
	if (ret) {
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
		folio_clear_hugetlb_vmemmap_optimized(folio);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @folio's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
 * vmemmap pages have been optimized.
 */
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
	LIST_HEAD(vmemmap_pages);

	__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU);
	free_vmemmap_page_list(&vmemmap_pages);
}

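/*
 * Pre-split the PMDs backing @folio's vmemmap into PTE tables so that the
 * later remap step only has to modify PTEs. The TLB flush is deferred
 * (VMEMMAP_SPLIT_NO_TLB_FLUSH) and done once by the batch caller.
 */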
static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
{
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	if (!vmemmap_should_optimize_folio(h, folio))
		return 0;

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end).
	 */
	return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
}

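/*
 * Batched optimization: first split every folio's vmemmap PMDs with a single
 * deferred TLB flush, then remap each folio with the remap-time flush also
 * deferred, flushing once more at the end. synchronize_rcu() is issued only
 * once per batch via VMEMMAP_SYNCHRONIZE_RCU.
 */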
void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
	struct folio *folio;
	LIST_HEAD(vmemmap_pages);
	unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;

	list_for_each_entry(folio, folio_list, lru) {
		int ret = hugetlb_vmemmap_split_folio(h, folio);

		/*
		 * Splitting the PMD requires allocating a page, so let's fail
		 * early once we encounter the first OOM. There is no point in
		 * retrying, as it can be dynamically done on remap with the
		 * memory we get back from the vmemmap deduplication.
		 */
		if (ret == -ENOMEM)
			break;
	}

	flush_tlb_all();

	list_for_each_entry(folio, folio_list, lru) {
		int ret;

		ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
		/* only need to synchronize_rcu() once for each batch */
		flags &= ~VMEMMAP_SYNCHRONIZE_RCU;

		/*
		 * Pages to be freed may have been accumulated. If we
		 * encounter an ENOMEM, free what we have and try again.
		 * This can occur in the case that both splitting fails
		 * halfway and head page allocation also failed. In this
		 * case __hugetlb_vmemmap_optimize_folio() would free memory
		 * allowing more vmemmap remaps to occur.
		 */
		if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
			flush_tlb_all();
			free_vmemmap_page_list(&vmemmap_pages);
			INIT_LIST_HEAD(&vmemmap_pages);
			__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
		}
	}

	flush_tlb_all();
	free_vmemmap_page_list(&vmemmap_pages);
}

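/*
 * vm.hugetlb_optimize_vmemmap: runtime toggle for HVO. Flipping it only
 * affects hugepages allocated or restored afterwards; already-optimized
 * folios keep their deduplicated vmemmap until they are freed.
 */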
static struct ctl_table hugetlb_vmemmap_sysctls[] = {
	{
		.procname	= "hugetlb_optimize_vmemmap",
		.data		= &vmemmap_optimize_enabled,
		.maxlen		= sizeof(vmemmap_optimize_enabled),
		.mode		= 0644,
		.proc_handler	= proc_dobool,
	},
};

static int __init hugetlb_vmemmap_init(void)
{
	const struct hstate *h;

	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
	BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);

	for_each_hstate(h) {
		if (hugetlb_vmemmap_optimizable(h)) {
			register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
			break;
		}
	}

	return 0;
}
late_initcall(hugetlb_vmemmap_init);