| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721 |
- // SPDX-License-Identifier: GPL-2.0
- /*
- * HugeTLB Vmemmap Optimization (HVO)
- *
- * Copyright (c) 2020, ByteDance. All rights reserved.
- *
- * Author: Muchun Song <songmuchun@bytedance.com>
- *
- * See Documentation/mm/vmemmap_dedup.rst
- */
- #define pr_fmt(fmt) "HugeTLB: " fmt
- #include <linux/pgtable.h>
- #include <linux/moduleparam.h>
- #include <linux/bootmem_info.h>
- #include <linux/mmdebug.h>
- #include <linux/pagewalk.h>
- #include <asm/pgalloc.h>
- #include <asm/tlbflush.h>
- #include "hugetlb_vmemmap.h"
- /**
- * struct vmemmap_remap_walk - walk vmemmap page table
- *
- * @remap_pte: called for each lowest-level entry (PTE).
- * @nr_walked: the number of walked pte.
- * @reuse_page: the page which is reused for the tail vmemmap pages.
- * @reuse_addr: the virtual address of the @reuse_page page.
- * @vmemmap_pages: the list head of the vmemmap pages that can be freed
- * or is mapped from.
- * @flags: used to modify behavior in vmemmap page table walking
- * operations.
- */
- struct vmemmap_remap_walk {
- void (*remap_pte)(pte_t *pte, unsigned long addr,
- struct vmemmap_remap_walk *walk);
- unsigned long nr_walked;
- struct page *reuse_page;
- unsigned long reuse_addr;
- struct list_head *vmemmap_pages;
- /* Skip the TLB flush when we split the PMD */
- #define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0)
- /* Skip the TLB flush when we remap the PTE */
- #define VMEMMAP_REMAP_NO_TLB_FLUSH BIT(1)
- /* synchronize_rcu() to avoid writes from page_ref_add_unless() */
- #define VMEMMAP_SYNCHRONIZE_RCU BIT(2)
- unsigned long flags;
- };
- static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
- struct vmemmap_remap_walk *walk)
- {
- pmd_t __pmd;
- int i;
- unsigned long addr = start;
- pte_t *pgtable;
- pgtable = pte_alloc_one_kernel(&init_mm);
- if (!pgtable)
- return -ENOMEM;
- pmd_populate_kernel(&init_mm, &__pmd, pgtable);
- for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
- pte_t entry, *pte;
- pgprot_t pgprot = PAGE_KERNEL;
- entry = mk_pte(head + i, pgprot);
- pte = pte_offset_kernel(&__pmd, addr);
- set_pte_at(&init_mm, addr, pte, entry);
- }
- spin_lock(&init_mm.page_table_lock);
- if (likely(pmd_leaf(*pmd))) {
- /*
- * Higher order allocations from buddy allocator must be able to
- * be treated as indepdenent small pages (as they can be freed
- * individually).
- */
- if (!PageReserved(head))
- split_page(head, get_order(PMD_SIZE));
- /* Make pte visible before pmd. See comment in pmd_install(). */
- smp_wmb();
- pmd_populate_kernel(&init_mm, pmd, pgtable);
- if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
- flush_tlb_kernel_range(start, start + PMD_SIZE);
- } else {
- pte_free_kernel(&init_mm, pgtable);
- }
- spin_unlock(&init_mm.page_table_lock);
- return 0;
- }
- static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
- unsigned long next, struct mm_walk *walk)
- {
- int ret = 0;
- struct page *head;
- struct vmemmap_remap_walk *vmemmap_walk = walk->private;
- /* Only splitting, not remapping the vmemmap pages. */
- if (!vmemmap_walk->remap_pte)
- walk->action = ACTION_CONTINUE;
- spin_lock(&init_mm.page_table_lock);
- head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
- /*
- * Due to HugeTLB alignment requirements and the vmemmap
- * pages being at the start of the hotplugged memory
- * region in memory_hotplug.memmap_on_memory case. Checking
- * the vmemmap page associated with the first vmemmap page
- * if it is self-hosted is sufficient.
- *
- * [ hotplugged memory ]
- * [ section ][...][ section ]
- * [ vmemmap ][ usable memory ]
- * ^ | ^ |
- * +--+ | |
- * +------------------------+
- */
- if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
- struct page *page = head ? head + pte_index(addr) :
- pte_page(ptep_get(pte_offset_kernel(pmd, addr)));
- if (PageVmemmapSelfHosted(page))
- ret = -ENOTSUPP;
- }
- spin_unlock(&init_mm.page_table_lock);
- if (!head || ret)
- return ret;
- return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
- }
- static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
- unsigned long next, struct mm_walk *walk)
- {
- struct vmemmap_remap_walk *vmemmap_walk = walk->private;
- /*
- * The reuse_page is found 'first' in page table walking before
- * starting remapping.
- */
- if (!vmemmap_walk->reuse_page)
- vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
- else
- vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
- vmemmap_walk->nr_walked++;
- return 0;
- }
- static const struct mm_walk_ops vmemmap_remap_ops = {
- .pmd_entry = vmemmap_pmd_entry,
- .pte_entry = vmemmap_pte_entry,
- };
- static int vmemmap_remap_range(unsigned long start, unsigned long end,
- struct vmemmap_remap_walk *walk)
- {
- int ret;
- VM_BUG_ON(!PAGE_ALIGNED(start | end));
- mmap_read_lock(&init_mm);
- ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
- NULL, walk);
- mmap_read_unlock(&init_mm);
- if (ret)
- return ret;
- if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
- flush_tlb_kernel_range(start, end);
- return 0;
- }
/*
 * Free a single vmemmap page.
 *
 * A vmemmap page comes either from the memblock allocator or from the buddy
 * allocator. PG_reserved tells them apart: a reserved page was allocated from
 * memblock and goes back through free_bootmem_page(), anything else is
 * released with __free_page(). The matching memmap page counter is decremented
 * in both cases.
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (!PageReserved(page)) {
		memmap_pages_add(-1);
		__free_page(page);
		return;
	}

	memmap_boot_pages_add(-1);
	free_bootmem_page(page);
}
- /* Free a list of the vmemmap pages */
- static void free_vmemmap_page_list(struct list_head *list)
- {
- struct page *page, *next;
- list_for_each_entry_safe(page, next, list, lru)
- free_vmemmap_page(page);
- }
- static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
- struct vmemmap_remap_walk *walk)
- {
- /*
- * Remap the tail pages as read-only to catch illegal write operation
- * to the tail pages.
- */
- pgprot_t pgprot = PAGE_KERNEL_RO;
- struct page *page = pte_page(ptep_get(pte));
- pte_t entry;
- /* Remapping the head page requires r/w */
- if (unlikely(addr == walk->reuse_addr)) {
- pgprot = PAGE_KERNEL;
- list_del(&walk->reuse_page->lru);
- /*
- * Makes sure that preceding stores to the page contents from
- * vmemmap_remap_free() become visible before the set_pte_at()
- * write.
- */
- smp_wmb();
- }
- entry = mk_pte(walk->reuse_page, pgprot);
- list_add(&page->lru, walk->vmemmap_pages);
- set_pte_at(&init_mm, addr, pte, entry);
- }
- /*
- * How many struct page structs need to be reset. When we reuse the head
- * struct page, the special metadata (e.g. page->flags or page->mapping)
- * cannot copy to the tail struct page structs. The invalid value will be
- * checked in the free_tail_page_prepare(). In order to avoid the message
- * of "corrupted mapping in tail page". We need to reset at least 3 (one
- * head struct page struct and two tail struct page structs) struct page
- * structs.
- */
- #define NR_RESET_STRUCT_PAGE 3
- static inline void reset_struct_pages(struct page *start)
- {
- struct page *from = start + NR_RESET_STRUCT_PAGE;
- BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
- memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
- }
- static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
- struct vmemmap_remap_walk *walk)
- {
- pgprot_t pgprot = PAGE_KERNEL;
- struct page *page;
- void *to;
- BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);
- page = list_first_entry(walk->vmemmap_pages, struct page, lru);
- list_del(&page->lru);
- to = page_to_virt(page);
- copy_page(to, (void *)walk->reuse_addr);
- reset_struct_pages(to);
- /*
- * Makes sure that preceding stores to the page contents become visible
- * before the set_pte_at() write.
- */
- smp_wmb();
- set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
- }
- /**
- * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
- * backing PMDs of the directmap into PTEs
- * @start: start address of the vmemmap virtual address range that we want
- * to remap.
- * @end: end address of the vmemmap virtual address range that we want to
- * remap.
- * @reuse: reuse address.
- *
- * Return: %0 on success, negative error code otherwise.
- */
- static int vmemmap_remap_split(unsigned long start, unsigned long end,
- unsigned long reuse)
- {
- struct vmemmap_remap_walk walk = {
- .remap_pte = NULL,
- .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH,
- };
- /* See the comment in the vmemmap_remap_free(). */
- BUG_ON(start - reuse != PAGE_SIZE);
- return vmemmap_remap_range(reuse, end, &walk);
- }
- /**
- * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
- * to the page which @reuse is mapped to, then free vmemmap
- * which the range are mapped to.
- * @start: start address of the vmemmap virtual address range that we want
- * to remap.
- * @end: end address of the vmemmap virtual address range that we want to
- * remap.
- * @reuse: reuse address.
- * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers
- * responsibility to free pages.
- * @flags: modifications to vmemmap_remap_walk flags
- *
- * Return: %0 on success, negative error code otherwise.
- */
- static int vmemmap_remap_free(unsigned long start, unsigned long end,
- unsigned long reuse,
- struct list_head *vmemmap_pages,
- unsigned long flags)
- {
- int ret;
- struct vmemmap_remap_walk walk = {
- .remap_pte = vmemmap_remap_pte,
- .reuse_addr = reuse,
- .vmemmap_pages = vmemmap_pages,
- .flags = flags,
- };
- int nid = page_to_nid((struct page *)reuse);
- gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
- /*
- * Allocate a new head vmemmap page to avoid breaking a contiguous
- * block of struct page memory when freeing it back to page allocator
- * in free_vmemmap_page_list(). This will allow the likely contiguous
- * struct page backing memory to be kept contiguous and allowing for
- * more allocations of hugepages. Fallback to the currently
- * mapped head page in case should it fail to allocate.
- */
- walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
- if (walk.reuse_page) {
- copy_page(page_to_virt(walk.reuse_page),
- (void *)walk.reuse_addr);
- list_add(&walk.reuse_page->lru, vmemmap_pages);
- memmap_pages_add(1);
- }
- /*
- * In order to make remapping routine most efficient for the huge pages,
- * the routine of vmemmap page table walking has the following rules
- * (see more details from the vmemmap_pte_range()):
- *
- * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
- * should be continuous.
- * - The @reuse address is part of the range [@reuse, @end) that we are
- * walking which is passed to vmemmap_remap_range().
- * - The @reuse address is the first in the complete range.
- *
- * So we need to make sure that @start and @reuse meet the above rules.
- */
- BUG_ON(start - reuse != PAGE_SIZE);
- ret = vmemmap_remap_range(reuse, end, &walk);
- if (ret && walk.nr_walked) {
- end = reuse + walk.nr_walked * PAGE_SIZE;
- /*
- * vmemmap_pages contains pages from the previous
- * vmemmap_remap_range call which failed. These
- * are pages which were removed from the vmemmap.
- * They will be restored in the following call.
- */
- walk = (struct vmemmap_remap_walk) {
- .remap_pte = vmemmap_restore_pte,
- .reuse_addr = reuse,
- .vmemmap_pages = vmemmap_pages,
- .flags = 0,
- };
- vmemmap_remap_range(reuse, end, &walk);
- }
- return ret;
- }
- static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
- struct list_head *list)
- {
- gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
- unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
- int nid = page_to_nid((struct page *)start);
- struct page *page, *next;
- int i;
- for (i = 0; i < nr_pages; i++) {
- page = alloc_pages_node(nid, gfp_mask, 0);
- if (!page)
- goto out;
- list_add(&page->lru, list);
- }
- memmap_pages_add(nr_pages);
- return 0;
- out:
- list_for_each_entry_safe(page, next, list, lru)
- __free_page(page);
- return -ENOMEM;
- }
- /**
- * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
- * to the page which is from the @vmemmap_pages
- * respectively.
- * @start: start address of the vmemmap virtual address range that we want
- * to remap.
- * @end: end address of the vmemmap virtual address range that we want to
- * remap.
- * @reuse: reuse address.
- * @flags: modifications to vmemmap_remap_walk flags
- *
- * Return: %0 on success, negative error code otherwise.
- */
- static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
- unsigned long reuse, unsigned long flags)
- {
- LIST_HEAD(vmemmap_pages);
- struct vmemmap_remap_walk walk = {
- .remap_pte = vmemmap_restore_pte,
- .reuse_addr = reuse,
- .vmemmap_pages = &vmemmap_pages,
- .flags = flags,
- };
- /* See the comment in the vmemmap_remap_free(). */
- BUG_ON(start - reuse != PAGE_SIZE);
- if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
- return -ENOMEM;
- return vmemmap_remap_range(reuse, end, &walk);
- }
- DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
- EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
- static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
- core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
- static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
- struct folio *folio, unsigned long flags)
- {
- int ret;
- unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
- unsigned long vmemmap_reuse;
- VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
- VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
- if (!folio_test_hugetlb_vmemmap_optimized(folio))
- return 0;
- if (flags & VMEMMAP_SYNCHRONIZE_RCU)
- synchronize_rcu();
- vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
- vmemmap_reuse = vmemmap_start;
- vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;
- /*
- * The pages which the vmemmap virtual address range [@vmemmap_start,
- * @vmemmap_end) are mapped to are freed to the buddy allocator, and
- * the range is mapped to the page which @vmemmap_reuse is mapped to.
- * When a HugeTLB page is freed to the buddy allocator, previously
- * discarded vmemmap pages must be allocated and remapping.
- */
- ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
- if (!ret) {
- folio_clear_hugetlb_vmemmap_optimized(folio);
- static_branch_dec(&hugetlb_optimize_vmemmap_key);
- }
- return ret;
- }
- /**
- * hugetlb_vmemmap_restore_folio - restore previously optimized (by
- * hugetlb_vmemmap_optimize_folio()) vmemmap pages which
- * will be reallocated and remapped.
- * @h: struct hstate.
- * @folio: the folio whose vmemmap pages will be restored.
- *
- * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
- * negative error code otherwise.
- */
- int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
- {
- return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU);
- }
- /**
- * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
- * @h: hstate.
- * @folio_list: list of folios.
- * @non_hvo_folios: Output list of folios for which vmemmap exists.
- *
- * Return: number of folios for which vmemmap was restored, or an error code
- * if an error was encountered restoring vmemmap for a folio.
- * Folios that have vmemmap are moved to the non_hvo_folios
- * list. Processing of entries stops when the first error is
- * encountered. The folio that experienced the error and all
- * non-processed folios will remain on folio_list.
- */
- long hugetlb_vmemmap_restore_folios(const struct hstate *h,
- struct list_head *folio_list,
- struct list_head *non_hvo_folios)
- {
- struct folio *folio, *t_folio;
- long restored = 0;
- long ret = 0;
- unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;
- list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
- if (folio_test_hugetlb_vmemmap_optimized(folio)) {
- ret = __hugetlb_vmemmap_restore_folio(h, folio, flags);
- /* only need to synchronize_rcu() once for each batch */
- flags &= ~VMEMMAP_SYNCHRONIZE_RCU;
- if (ret)
- break;
- restored++;
- }
- /* Add non-optimized folios to output list */
- list_move(&folio->lru, non_hvo_folios);
- }
- if (restored)
- flush_tlb_all();
- if (!ret)
- ret = restored;
- return ret;
- }
- /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
- static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
- {
- if (folio_test_hugetlb_vmemmap_optimized(folio))
- return false;
- if (!READ_ONCE(vmemmap_optimize_enabled))
- return false;
- if (!hugetlb_vmemmap_optimizable(h))
- return false;
- return true;
- }
- static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
- struct folio *folio,
- struct list_head *vmemmap_pages,
- unsigned long flags)
- {
- int ret = 0;
- unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
- unsigned long vmemmap_reuse;
- VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
- VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
- if (!vmemmap_should_optimize_folio(h, folio))
- return ret;
- static_branch_inc(&hugetlb_optimize_vmemmap_key);
- if (flags & VMEMMAP_SYNCHRONIZE_RCU)
- synchronize_rcu();
- /*
- * Very Subtle
- * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
- * immediately after remapping. As a result, subsequent accesses
- * and modifications to struct pages associated with the hugetlb
- * page could be to the OLD struct pages. Set the vmemmap optimized
- * flag here so that it is copied to the new head page. This keeps
- * the old and new struct pages in sync.
- * If there is an error during optimization, we will immediately FLUSH
- * the TLB and clear the flag below.
- */
- folio_set_hugetlb_vmemmap_optimized(folio);
- vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
- vmemmap_reuse = vmemmap_start;
- vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;
- /*
- * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
- * to the page which @vmemmap_reuse is mapped to. Add pages previously
- * mapping the range to vmemmap_pages list so that they can be freed by
- * the caller.
- */
- ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
- vmemmap_pages, flags);
- if (ret) {
- static_branch_dec(&hugetlb_optimize_vmemmap_key);
- folio_clear_hugetlb_vmemmap_optimized(folio);
- }
- return ret;
- }
- /**
- * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
- * @h: struct hstate.
- * @folio: the folio whose vmemmap pages will be optimized.
- *
- * This function only tries to optimize @folio's vmemmap pages and does not
- * guarantee that the optimization will succeed after it returns. The caller
- * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
- * vmemmap pages have been optimized.
- */
- void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
- {
- LIST_HEAD(vmemmap_pages);
- __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU);
- free_vmemmap_page_list(&vmemmap_pages);
- }
- static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
- {
- unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
- unsigned long vmemmap_reuse;
- if (!vmemmap_should_optimize_folio(h, folio))
- return 0;
- vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
- vmemmap_reuse = vmemmap_start;
- vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;
- /*
- * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
- * @vmemmap_end]
- */
- return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
- }
- void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
- {
- struct folio *folio;
- LIST_HEAD(vmemmap_pages);
- unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;
- list_for_each_entry(folio, folio_list, lru) {
- int ret = hugetlb_vmemmap_split_folio(h, folio);
- /*
- * Spliting the PMD requires allocating a page, thus lets fail
- * early once we encounter the first OOM. No point in retrying
- * as it can be dynamically done on remap with the memory
- * we get back from the vmemmap deduplication.
- */
- if (ret == -ENOMEM)
- break;
- }
- flush_tlb_all();
- list_for_each_entry(folio, folio_list, lru) {
- int ret;
- ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
- /* only need to synchronize_rcu() once for each batch */
- flags &= ~VMEMMAP_SYNCHRONIZE_RCU;
- /*
- * Pages to be freed may have been accumulated. If we
- * encounter an ENOMEM, free what we have and try again.
- * This can occur in the case that both spliting fails
- * halfway and head page allocation also failed. In this
- * case __hugetlb_vmemmap_optimize_folio() would free memory
- * allowing more vmemmap remaps to occur.
- */
- if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
- flush_tlb_all();
- free_vmemmap_page_list(&vmemmap_pages);
- INIT_LIST_HEAD(&vmemmap_pages);
- __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
- }
- }
- flush_tlb_all();
- free_vmemmap_page_list(&vmemmap_pages);
- }
- static struct ctl_table hugetlb_vmemmap_sysctls[] = {
- {
- .procname = "hugetlb_optimize_vmemmap",
- .data = &vmemmap_optimize_enabled,
- .maxlen = sizeof(vmemmap_optimize_enabled),
- .mode = 0644,
- .proc_handler = proc_dobool,
- },
- };
- static int __init hugetlb_vmemmap_init(void)
- {
- const struct hstate *h;
- /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
- BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);
- for_each_hstate(h) {
- if (hugetlb_vmemmap_optimizable(h)) {
- register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
- break;
- }
- }
- return 0;
- }
- late_initcall(hugetlb_vmemmap_init);
|