internal.h

  1. /* SPDX-License-Identifier: GPL-2.0-or-later */
  2. /* internal.h: mm/ internal definitions
  3. *
  4. * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
  5. * Written by David Howells (dhowells@redhat.com)
  6. */
  7. #ifndef __MM_INTERNAL_H
  8. #define __MM_INTERNAL_H
  9. #include <linux/fs.h>
  10. #include <linux/khugepaged.h>
  11. #include <linux/mm.h>
  12. #include <linux/mm_inline.h>
  13. #include <linux/pagemap.h>
  14. #include <linux/rmap.h>
  15. #include <linux/swap.h>
  16. #include <linux/swapops.h>
  17. #include <linux/swap_cgroup.h>
  18. #include <linux/tracepoint-defs.h>
  19. /* Internal core VMA manipulation functions. */
  20. #include "vma.h"
  21. struct folio_batch;
  22. /*
  23. * The set of flags that only affect watermark checking and reclaim
  24. * behaviour. This is used by the MM to obey the caller constraints
  25. * about IO, FS and watermark checking while ignoring placement
  26. * hints such as HIGHMEM usage.
  27. */
  28. #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
  29. __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
  30. __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
  31. __GFP_NOLOCKDEP)
  32. /* The GFP flags allowed during early boot */
  33. #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
  34. /* Control allocation cpuset and node placement constraints */
  35. #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
  36. /* Do not use these with a slab allocator */
  37. #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
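/*
 * Illustrative sketch (editor's addition, not part of the original header):
 * the intended use of GFP_RECLAIM_MASK is to keep only the caller's
 * reclaim/watermark behaviour bits when a nested, internal allocation is
 * made on its behalf. The helper name below is hypothetical.
 */
static inline gfp_t example_internal_alloc_gfp(gfp_t caller_gfp)
{
	/* Honour the caller's IO/FS/watermark constraints, drop placement hints. */
	return caller_gfp & GFP_RECLAIM_MASK;
}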
  38. /*
  39. * Different from WARN_ON_ONCE(), no warning will be issued
  40. * when we specify __GFP_NOWARN.
  41. */
  42. #define WARN_ON_ONCE_GFP(cond, gfp) ({ \
  43. static bool __section(".data..once") __warned; \
  44. int __ret_warn_once = !!(cond); \
  45. \
  46. if (unlikely(!(gfp & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \
  47. __warned = true; \
  48. WARN_ON(1); \
  49. } \
  50. unlikely(__ret_warn_once); \
  51. })
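/*
 * Illustrative sketch (editor's addition): WARN_ON_ONCE_GFP() is meant for
 * allocator-side sanity checks that must stay silent when the caller passed
 * __GFP_NOWARN. The specific condition below is a hypothetical example.
 */
static inline bool example_order_too_large(unsigned int order, gfp_t gfp)
{
	return WARN_ON_ONCE_GFP(order > MAX_PAGE_ORDER, gfp);
}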
  52. void page_writeback_init(void);
  53. /*
  54. * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages,
  55. * its nr_pages_mapped would be 0x400000: choose the ENTIRELY_MAPPED bit
  56. * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently
  57. * leaves nr_pages_mapped at 0, but avoid surprise if it participates later.
  58. */
  59. #define ENTIRELY_MAPPED 0x800000
  60. #define FOLIO_PAGES_MAPPED (ENTIRELY_MAPPED - 1)
  61. /*
  62. * Flags passed to __show_mem() and show_free_areas() to suppress output in
  63. * various contexts.
  64. */
  65. #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */
  66. /*
  67. * How many individual pages have an elevated _mapcount. Excludes
  68. * the folio's entire_mapcount.
  69. *
  70. * Don't use this function outside of debugging code.
  71. */
  72. static inline int folio_nr_pages_mapped(const struct folio *folio)
  73. {
  74. return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED;
  75. }
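/*
 * Worked example (editor's addition): a large folio with three of its pages
 * PTE-mapped and which is also entirely mapped by a PMD has _nr_pages_mapped
 * equal to ENTIRELY_MAPPED + 3; folio_nr_pages_mapped() masks off
 * ENTIRELY_MAPPED and returns 3.
 */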
  76. /*
  77. * Retrieve the first entry of a folio based on a provided entry within the
  78. * folio. We cannot rely on folio->swap as there is no guarantee that it has
80. * been initialized. Used for calling arch_swap_restore().
  80. */
  81. static inline swp_entry_t folio_swap(swp_entry_t entry,
  82. const struct folio *folio)
  83. {
  84. swp_entry_t swap = {
  85. .val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)),
  86. };
  87. return swap;
  88. }
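/*
 * Worked example (editor's addition): for an order-2 folio whose four swap
 * slots sit at offsets 8..11 of one swap type, the cluster is naturally
 * aligned to folio_nr_pages(), so folio_swap() maps the entry for any of
 * offsets 8..11 back to the folio's first entry at offset 8.
 */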
  89. static inline void *folio_raw_mapping(const struct folio *folio)
  90. {
  91. unsigned long mapping = (unsigned long)folio->mapping;
  92. return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
  93. }
  94. /*
  95. * This is a file-backed mapping, and is about to be memory mapped - invoke its
  96. * mmap hook and safely handle error conditions. On error, VMA hooks will be
  97. * mutated.
  98. *
  99. * @file: File which backs the mapping.
  100. * @vma: VMA which we are mapping.
  101. *
102. * Returns: 0 on success, an error code otherwise.
  103. */
  104. static inline int mmap_file(struct file *file, struct vm_area_struct *vma)
  105. {
  106. int err = call_mmap(file, vma);
  107. if (likely(!err))
  108. return 0;
  109. /*
  110. * OK, we tried to call the file hook for mmap(), but an error
111. * arose. The mapping is in an inconsistent state and we must not invoke
  112. * any further hooks on it.
  113. */
  114. vma->vm_ops = &vma_dummy_vm_ops;
  115. return err;
  116. }
  117. /*
  118. * If the VMA has a close hook then close it, and since closing it might leave
  119. * it in an inconsistent state which makes the use of any hooks suspect, clear
  120. * them down by installing dummy empty hooks.
  121. */
  122. static inline void vma_close(struct vm_area_struct *vma)
  123. {
  124. if (vma->vm_ops && vma->vm_ops->close) {
  125. vma->vm_ops->close(vma);
  126. /*
  127. * The mapping is in an inconsistent state, and no further hooks
  128. * may be invoked upon it.
  129. */
  130. vma->vm_ops = &vma_dummy_vm_ops;
  131. }
  132. }
  133. #ifdef CONFIG_MMU
  134. /* Flags for folio_pte_batch(). */
  135. typedef int __bitwise fpb_t;
  136. /* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */
  137. #define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0))
  138. /* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */
  139. #define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1))
  140. static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
  141. {
  142. if (flags & FPB_IGNORE_DIRTY)
  143. pte = pte_mkclean(pte);
  144. if (likely(flags & FPB_IGNORE_SOFT_DIRTY))
  145. pte = pte_clear_soft_dirty(pte);
  146. return pte_wrprotect(pte_mkold(pte));
  147. }
  148. /**
  149. * folio_pte_batch - detect a PTE batch for a large folio
  150. * @folio: The large folio to detect a PTE batch for.
  151. * @addr: The user virtual address the first page is mapped at.
  152. * @start_ptep: Page table pointer for the first entry.
  153. * @pte: Page table entry for the first page.
  154. * @max_nr: The maximum number of table entries to consider.
  155. * @flags: Flags to modify the PTE batch semantics.
  156. * @any_writable: Optional pointer to indicate whether any entry except the
  157. * first one is writable.
  158. * @any_young: Optional pointer to indicate whether any entry except the
  159. * first one is young.
  160. * @any_dirty: Optional pointer to indicate whether any entry except the
  161. * first one is dirty.
  162. *
  163. * Detect a PTE batch: consecutive (present) PTEs that map consecutive
  164. * pages of the same large folio.
  165. *
  166. * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
  167. * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and
  168. * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY).
  169. *
  170. * start_ptep must map any page of the folio. max_nr must be at least one and
  171. * must be limited by the caller so scanning cannot exceed a single page table.
  172. *
  173. * Return: the number of table entries in the batch.
  174. */
  175. static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
  176. pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
  177. bool *any_writable, bool *any_young, bool *any_dirty)
  178. {
  179. pte_t expected_pte, *ptep;
  180. bool writable, young, dirty;
  181. int nr, cur_nr;
  182. if (any_writable)
  183. *any_writable = false;
  184. if (any_young)
  185. *any_young = false;
  186. if (any_dirty)
  187. *any_dirty = false;
  188. VM_WARN_ON_FOLIO(!pte_present(pte), folio);
  189. VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
  190. VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
  191. /* Limit max_nr to the actual remaining PFNs in the folio we could batch. */
  192. max_nr = min_t(unsigned long, max_nr,
  193. folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte));
  194. nr = pte_batch_hint(start_ptep, pte);
  195. expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
  196. ptep = start_ptep + nr;
  197. while (nr < max_nr) {
  198. pte = ptep_get(ptep);
  199. if (any_writable)
  200. writable = !!pte_write(pte);
  201. if (any_young)
  202. young = !!pte_young(pte);
  203. if (any_dirty)
  204. dirty = !!pte_dirty(pte);
  205. pte = __pte_batch_clear_ignored(pte, flags);
  206. if (!pte_same(pte, expected_pte))
  207. break;
  208. if (any_writable)
  209. *any_writable |= writable;
  210. if (any_young)
  211. *any_young |= young;
  212. if (any_dirty)
  213. *any_dirty |= dirty;
  214. cur_nr = pte_batch_hint(ptep, pte);
  215. expected_pte = pte_advance_pfn(expected_pte, cur_nr);
  216. ptep += cur_nr;
  217. nr += cur_nr;
  218. }
  219. return min(nr, max_nr);
  220. }
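/*
 * Illustrative sketch (editor's addition): a typical caller pattern,
 * assuming @ptep/@addr/@end come from a pte_offset_map_lock() walk confined
 * to a single page table, and @pte is a present PTE mapping a page of the
 * large @folio. Not a real caller from this tree.
 */
static inline int example_batch_folio_ptes(struct folio *folio,
		unsigned long addr, unsigned long end, pte_t *ptep, pte_t pte)
{
	int max_nr = (end - addr) >> PAGE_SHIFT;

	/* Ignore per-PTE dirty/soft-dirty differences while batching. */
	return folio_pte_batch(folio, addr, ptep, pte, max_nr,
			       FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY,
			       NULL, NULL, NULL);
}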
  221. /**
  222. * pte_move_swp_offset - Move the swap entry offset field of a swap pte
  223. * forward or backward by delta
  224. * @pte: The initial pte state; is_swap_pte(pte) must be true and
  225. * non_swap_entry() must be false.
  226. * @delta: The direction and the offset we are moving; forward if delta
  227. * is positive; backward if delta is negative
  228. *
  229. * Moves the swap offset, while maintaining all other fields, including
  230. * swap type, and any swp pte bits. The resulting pte is returned.
  231. */
  232. static inline pte_t pte_move_swp_offset(pte_t pte, long delta)
  233. {
  234. swp_entry_t entry = pte_to_swp_entry(pte);
  235. pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
  236. (swp_offset(entry) + delta)));
  237. if (pte_swp_soft_dirty(pte))
  238. new = pte_swp_mksoft_dirty(new);
  239. if (pte_swp_exclusive(pte))
  240. new = pte_swp_mkexclusive(new);
  241. if (pte_swp_uffd_wp(pte))
  242. new = pte_swp_mkuffd_wp(new);
  243. return new;
  244. }
  245. /**
  246. * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
  247. * @pte: The initial pte state; is_swap_pte(pte) must be true and
  248. * non_swap_entry() must be false.
  249. *
  250. * Increments the swap offset, while maintaining all other fields, including
  251. * swap type, and any swp pte bits. The resulting pte is returned.
  252. */
  253. static inline pte_t pte_next_swp_offset(pte_t pte)
  254. {
  255. return pte_move_swp_offset(pte, 1);
  256. }
  257. /**
  258. * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries
  259. * @start_ptep: Page table pointer for the first entry.
  260. * @max_nr: The maximum number of table entries to consider.
  261. * @pte: Page table entry for the first entry.
  262. *
  263. * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs
  264. * containing swap entries all with consecutive offsets and targeting the same
  265. * swap type, all with matching swp pte bits.
  266. *
  267. * max_nr must be at least one and must be limited by the caller so scanning
  268. * cannot exceed a single page table.
  269. *
  270. * Return: the number of table entries in the batch.
  271. */
  272. static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
  273. {
  274. pte_t expected_pte = pte_next_swp_offset(pte);
  275. const pte_t *end_ptep = start_ptep + max_nr;
  276. swp_entry_t entry = pte_to_swp_entry(pte);
  277. pte_t *ptep = start_ptep + 1;
  278. unsigned short cgroup_id;
  279. VM_WARN_ON(max_nr < 1);
  280. VM_WARN_ON(!is_swap_pte(pte));
  281. VM_WARN_ON(non_swap_entry(entry));
  282. cgroup_id = lookup_swap_cgroup_id(entry);
  283. while (ptep < end_ptep) {
  284. pte = ptep_get(ptep);
  285. if (!pte_same(pte, expected_pte))
  286. break;
  287. if (lookup_swap_cgroup_id(pte_to_swp_entry(pte)) != cgroup_id)
  288. break;
  289. expected_pte = pte_next_swp_offset(expected_pte);
  290. ptep++;
  291. }
  292. return ptep - start_ptep;
  293. }
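/*
 * Illustrative sketch (editor's addition): batching contiguous swap entries
 * while tearing down a range, assuming @ptep points into a single page table,
 * addr < end, and the range does not cross that page table.
 */
static inline int example_batch_swap_ptes(pte_t *ptep, unsigned long addr,
					  unsigned long end)
{
	int max_nr = (end - addr) >> PAGE_SHIFT;
	pte_t pte = ptep_get(ptep);

	/* Only genuine swap entries qualify for batching. */
	if (!is_swap_pte(pte) || non_swap_entry(pte_to_swp_entry(pte)))
		return 0;
	return swap_pte_batch(ptep, max_nr, pte);
}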
  294. #endif /* CONFIG_MMU */
  295. void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
  296. int nr_throttled);
  297. static inline void acct_reclaim_writeback(struct folio *folio)
  298. {
  299. pg_data_t *pgdat = folio_pgdat(folio);
  300. int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled);
  301. if (nr_throttled)
  302. __acct_reclaim_writeback(pgdat, folio, nr_throttled);
  303. }
  304. static inline void wake_throttle_isolated(pg_data_t *pgdat)
  305. {
  306. wait_queue_head_t *wqh;
  307. wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED];
  308. if (waitqueue_active(wqh))
  309. wake_up(wqh);
  310. }
  311. vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf);
  312. static inline vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
  313. {
  314. vm_fault_t ret = __vmf_anon_prepare(vmf);
  315. if (unlikely(ret & VM_FAULT_RETRY))
  316. vma_end_read(vmf->vma);
  317. return ret;
  318. }
  319. vm_fault_t do_swap_page(struct vm_fault *vmf);
  320. void folio_rotate_reclaimable(struct folio *folio);
  321. bool __folio_end_writeback(struct folio *folio);
  322. void deactivate_file_folio(struct folio *folio);
  323. void folio_activate(struct folio *folio);
  324. void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
  325. struct vm_area_struct *start_vma, unsigned long floor,
  326. unsigned long ceiling, bool mm_wr_locked);
  327. void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
  328. struct zap_details;
  329. void unmap_page_range(struct mmu_gather *tlb,
  330. struct vm_area_struct *vma,
  331. unsigned long addr, unsigned long end,
  332. struct zap_details *details);
  333. void page_cache_ra_order(struct readahead_control *, struct file_ra_state *,
  334. unsigned int order);
  335. void force_page_cache_ra(struct readahead_control *, unsigned long nr);
  336. static inline void force_page_cache_readahead(struct address_space *mapping,
  337. struct file *file, pgoff_t index, unsigned long nr_to_read)
  338. {
  339. DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index);
  340. force_page_cache_ra(&ractl, nr_to_read);
  341. }
  342. unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
  343. pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
  344. unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
  345. pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
  346. void filemap_free_folio(struct address_space *mapping, struct folio *folio);
  347. int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
  348. bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
  349. loff_t end);
  350. long mapping_evict_folio(struct address_space *mapping, struct folio *folio);
  351. unsigned long mapping_try_invalidate(struct address_space *mapping,
  352. pgoff_t start, pgoff_t end, unsigned long *nr_failed);
  353. /**
  354. * folio_evictable - Test whether a folio is evictable.
  355. * @folio: The folio to test.
  356. *
  357. * Test whether @folio is evictable -- i.e., should be placed on
  358. * active/inactive lists vs unevictable list.
  359. *
  360. * Reasons folio might not be evictable:
  361. * 1. folio's mapping marked unevictable
  362. * 2. One of the pages in the folio is part of an mlocked VMA
  363. */
  364. static inline bool folio_evictable(struct folio *folio)
  365. {
  366. bool ret;
  367. /* Prevent address_space of inode and swap cache from being freed */
  368. rcu_read_lock();
  369. ret = !mapping_unevictable(folio_mapping(folio)) &&
  370. !folio_test_mlocked(folio);
  371. rcu_read_unlock();
  372. return ret;
  373. }
  374. /*
  375. * Turn a non-refcounted page (->_refcount == 0) into refcounted with
  376. * a count of one.
  377. */
  378. static inline void set_page_refcounted(struct page *page)
  379. {
  380. VM_BUG_ON_PAGE(PageTail(page), page);
  381. VM_BUG_ON_PAGE(page_ref_count(page), page);
  382. set_page_count(page, 1);
  383. }
  384. /*
  385. * Return true if a folio needs ->release_folio() calling upon it.
  386. */
  387. static inline bool folio_needs_release(struct folio *folio)
  388. {
  389. struct address_space *mapping = folio_mapping(folio);
  390. return folio_has_private(folio) ||
  391. (mapping && mapping_release_always(mapping));
  392. }
  393. extern unsigned long highest_memmap_pfn;
  394. /*
  395. * Maximum number of reclaim retries without progress before the OOM
396. * killer is considered the only way forward.
  397. */
  398. #define MAX_RECLAIM_RETRIES 16
  399. /*
  400. * in mm/vmscan.c:
  401. */
  402. bool folio_isolate_lru(struct folio *folio);
  403. void folio_putback_lru(struct folio *folio);
  404. extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
  405. /*
  406. * in mm/rmap.c:
  407. */
  408. pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
  409. /*
  410. * in mm/page_alloc.c
  411. */
  412. #define K(x) ((x) << (PAGE_SHIFT-10))
  413. extern char * const zone_names[MAX_NR_ZONES];
  414. /* perform sanity checks on struct pages being allocated or freed */
  415. DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
  416. extern int min_free_kbytes;
  417. void setup_per_zone_wmarks(void);
  418. void calculate_min_free_kbytes(void);
  419. int __meminit init_per_zone_wmark_min(void);
  420. void page_alloc_sysctl_init(void);
  421. /*
  422. * Structure for holding the mostly immutable allocation parameters passed
  423. * between functions involved in allocations, including the alloc_pages*
  424. * family of functions.
  425. *
  426. * nodemask, migratetype and highest_zoneidx are initialized only once in
  427. * __alloc_pages() and then never change.
  428. *
  429. * zonelist, preferred_zone and highest_zoneidx are set first in
  430. * __alloc_pages() for the fast path, and might be later changed
  431. * in __alloc_pages_slowpath(). All other functions pass the whole structure
  432. * by a const pointer.
  433. */
  434. struct alloc_context {
  435. struct zonelist *zonelist;
  436. nodemask_t *nodemask;
  437. struct zoneref *preferred_zoneref;
  438. int migratetype;
  439. /*
440. * highest_zoneidx represents the highest usable zone index of
441. * the allocation request. Due to the nature of the zones,
442. * memory in zones lower than highest_zoneidx will be
443. * protected by lowmem_reserve[highest_zoneidx].
444. *
445. * highest_zoneidx is also used by reclaim/compaction to limit
446. * the target zone, since zones higher than this index cannot
447. * be used for this allocation request.
  448. */
  449. enum zone_type highest_zoneidx;
  450. bool spread_dirty_pages;
  451. };
  452. /*
  453. * This function returns the order of a free page in the buddy system. In
  454. * general, page_zone(page)->lock must be held by the caller to prevent the
  455. * page from being allocated in parallel and returning garbage as the order.
  456. * If a caller does not hold page_zone(page)->lock, it must guarantee that the
  457. * page cannot be allocated or merged in parallel. Alternatively, it must
  458. * handle invalid values gracefully, and use buddy_order_unsafe() below.
  459. */
  460. static inline unsigned int buddy_order(struct page *page)
  461. {
  462. /* PageBuddy() must be checked by the caller */
  463. return page_private(page);
  464. }
  465. /*
  466. * Like buddy_order(), but for callers who cannot afford to hold the zone lock.
  467. * PageBuddy() should be checked first by the caller to minimize race window,
  468. * and invalid values must be handled gracefully.
  469. *
  470. * READ_ONCE is used so that if the caller assigns the result into a local
  471. * variable and e.g. tests it for valid range before using, the compiler cannot
  472. * decide to remove the variable and inline the page_private(page) multiple
  473. * times, potentially observing different values in the tests and the actual
  474. * use of the result.
  475. */
  476. #define buddy_order_unsafe(page) READ_ONCE(page_private(page))
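/*
 * Illustrative sketch (editor's addition): the intended pattern for
 * buddy_order_unsafe() without holding zone->lock -- check PageBuddy()
 * first, read the order once into a local variable and range-check it
 * before acting on it, so a racing allocation cannot produce an
 * out-of-range value between the test and the use.
 */
static inline bool example_buddy_order_plausible(struct page *page)
{
	unsigned int order;

	if (!PageBuddy(page))
		return false;
	order = buddy_order_unsafe(page);
	/* The value may still be stale; only trust it after a range check. */
	return order <= MAX_PAGE_ORDER;
}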
  477. /*
478. * This function checks whether a page is free && is the buddy of @page.
479. * We can coalesce a page and its buddy if
  480. * (a) the buddy is not in a hole (check before calling!) &&
  481. * (b) the buddy is in the buddy system &&
  482. * (c) a page and its buddy have the same order &&
  483. * (d) a page and its buddy are in the same zone.
  484. *
  485. * For recording whether a page is in the buddy system, we set PageBuddy.
  486. * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
  487. *
  488. * For recording page's order, we use page_private(page).
  489. */
  490. static inline bool page_is_buddy(struct page *page, struct page *buddy,
  491. unsigned int order)
  492. {
  493. if (!page_is_guard(buddy) && !PageBuddy(buddy))
  494. return false;
  495. if (buddy_order(buddy) != order)
  496. return false;
  497. /*
  498. * zone check is done late to avoid uselessly calculating
  499. * zone/node ids for pages that could never merge.
  500. */
  501. if (page_zone_id(page) != page_zone_id(buddy))
  502. return false;
  503. VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
  504. return true;
  505. }
  506. /*
  507. * Locate the struct page for both the matching buddy in our
  508. * pair (buddy1) and the combined O(n+1) page they form (page).
  509. *
  510. * 1) Any buddy B1 will have an order O twin B2 which satisfies
  511. * the following equation:
  512. * B2 = B1 ^ (1 << O)
513. * For example, if the starting buddy (B1) is #8, its order-1
514. * buddy is #10:
  515. * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
  516. *
  517. * 2) Any buddy B will have an order O+1 parent P which
  518. * satisfies the following equation:
  519. * P = B & ~(1 << O)
  520. *
  521. * Assumption: *_mem_map is contiguous at least up to MAX_PAGE_ORDER
  522. */
  523. static inline unsigned long
  524. __find_buddy_pfn(unsigned long page_pfn, unsigned int order)
  525. {
  526. return page_pfn ^ (1 << order);
  527. }
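/*
 * Worked example (editor's addition): for pfn 8 at order 1,
 * __find_buddy_pfn(8, 1) == 8 ^ 2 == 10; for pfn 10 the buddy is again 8.
 * The combined order-2 parent of either buddy starts at pfn 8 & ~2 == 8.
 */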
  528. /*
  529. * Find the buddy of @page and validate it.
  530. * @page: The input page
  531. * @pfn: The pfn of the page, it saves a call to page_to_pfn() when the
  532. * function is used in the performance-critical __free_one_page().
  533. * @order: The order of the page
  534. * @buddy_pfn: The output pointer to the buddy pfn, it also saves a call to
  535. * page_to_pfn().
  536. *
537. * The found buddy can be a non-PageBuddy page, outside @page's zone, or of an
538. * order different from @page's. The validation is necessary before using it.
  539. *
  540. * Return: the found buddy page or NULL if not found.
  541. */
  542. static inline struct page *find_buddy_page_pfn(struct page *page,
  543. unsigned long pfn, unsigned int order, unsigned long *buddy_pfn)
  544. {
  545. unsigned long __buddy_pfn = __find_buddy_pfn(pfn, order);
  546. struct page *buddy;
  547. buddy = page + (__buddy_pfn - pfn);
  548. if (buddy_pfn)
  549. *buddy_pfn = __buddy_pfn;
  550. if (page_is_buddy(page, buddy, order))
  551. return buddy;
  552. return NULL;
  553. }
  554. extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
  555. unsigned long end_pfn, struct zone *zone);
  556. static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
  557. unsigned long end_pfn, struct zone *zone)
  558. {
  559. if (zone->contiguous)
  560. return pfn_to_page(start_pfn);
  561. return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
  562. }
  563. void set_zone_contiguous(struct zone *zone);
  564. static inline void clear_zone_contiguous(struct zone *zone)
  565. {
  566. zone->contiguous = false;
  567. }
  568. extern int __isolate_free_page(struct page *page, unsigned int order);
  569. extern void __putback_isolated_page(struct page *page, unsigned int order,
  570. int mt);
  571. extern void memblock_free_pages(struct page *page, unsigned long pfn,
  572. unsigned int order);
  573. extern void __free_pages_core(struct page *page, unsigned int order,
  574. enum meminit_context context);
  575. /*
  576. * This will have no effect, other than possibly generating a warning, if the
  577. * caller passes in a non-large folio.
  578. */
  579. static inline void folio_set_order(struct folio *folio, unsigned int order)
  580. {
  581. if (WARN_ON_ONCE(!order || !folio_test_large(folio)))
  582. return;
  583. folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order;
  584. #ifdef CONFIG_64BIT
  585. folio->_folio_nr_pages = 1U << order;
  586. #endif
  587. }
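/*
 * Worked example (editor's addition): folio_set_order(folio, 9) stores 9 in
 * the low byte of _flags_1 and, on 64-bit, sets _folio_nr_pages to
 * 1 << 9 == 512.
 */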
  588. bool __folio_unqueue_deferred_split(struct folio *folio);
  589. static inline bool folio_unqueue_deferred_split(struct folio *folio)
  590. {
  591. if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio))
  592. return false;
  593. /*
  594. * At this point, there is no one trying to add the folio to
  595. * deferred_list. If folio is not in deferred_list, it's safe
  596. * to check without acquiring the split_queue_lock.
  597. */
  598. if (data_race(list_empty(&folio->_deferred_list)))
  599. return false;
  600. return __folio_unqueue_deferred_split(folio);
  601. }
  602. static inline struct folio *page_rmappable_folio(struct page *page)
  603. {
  604. struct folio *folio = (struct folio *)page;
  605. if (folio && folio_test_large(folio))
  606. folio_set_large_rmappable(folio);
  607. return folio;
  608. }
  609. static inline void prep_compound_head(struct page *page, unsigned int order)
  610. {
  611. struct folio *folio = (struct folio *)page;
  612. folio_set_order(folio, order);
  613. atomic_set(&folio->_large_mapcount, -1);
  614. atomic_set(&folio->_entire_mapcount, -1);
  615. atomic_set(&folio->_nr_pages_mapped, 0);
  616. atomic_set(&folio->_pincount, 0);
  617. if (order > 1)
  618. INIT_LIST_HEAD(&folio->_deferred_list);
  619. }
  620. static inline void prep_compound_tail(struct page *head, int tail_idx)
  621. {
  622. struct page *p = head + tail_idx;
  623. p->mapping = TAIL_MAPPING;
  624. set_compound_head(p, head);
  625. set_page_private(p, 0);
  626. }
  627. extern void prep_compound_page(struct page *page, unsigned int order);
  628. extern void post_alloc_hook(struct page *page, unsigned int order,
  629. gfp_t gfp_flags);
  630. extern bool free_pages_prepare(struct page *page, unsigned int order);
  631. extern int user_min_free_kbytes;
  632. void free_unref_page(struct page *page, unsigned int order);
  633. void free_unref_folios(struct folio_batch *fbatch);
  634. extern void zone_pcp_reset(struct zone *zone);
  635. extern void zone_pcp_disable(struct zone *zone);
  636. extern void zone_pcp_enable(struct zone *zone);
  637. extern void zone_pcp_init(struct zone *zone);
  638. extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
  639. phys_addr_t min_addr,
  640. int nid, bool exact_nid);
  641. void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
  642. unsigned long, enum meminit_context, struct vmem_altmap *, int);
  643. #if defined CONFIG_COMPACTION || defined CONFIG_CMA
  644. /*
  645. * in mm/compaction.c
  646. */
  647. /*
  648. * compact_control is used to track pages being migrated and the free pages
  649. * they are being migrated to during memory compaction. The free_pfn starts
  650. * at the end of a zone and migrate_pfn begins at the start. Movable pages
  651. * are moved to the end of a zone during a compaction run and the run
  652. * completes when free_pfn <= migrate_pfn
  653. */
  654. struct compact_control {
  655. struct list_head freepages[NR_PAGE_ORDERS]; /* List of free pages to migrate to */
  656. struct list_head migratepages; /* List of pages being migrated */
  657. unsigned int nr_freepages; /* Number of isolated free pages */
  658. unsigned int nr_migratepages; /* Number of pages to migrate */
  659. unsigned long free_pfn; /* isolate_freepages search base */
  660. /*
  661. * Acts as an in/out parameter to page isolation for migration.
  662. * isolate_migratepages uses it as a search base.
  663. * isolate_migratepages_block will update the value to the next pfn
  664. * after the last isolated one.
  665. */
  666. unsigned long migrate_pfn;
  667. unsigned long fast_start_pfn; /* a pfn to start linear scan from */
  668. struct zone *zone;
  669. unsigned long total_migrate_scanned;
  670. unsigned long total_free_scanned;
  671. unsigned short fast_search_fail;/* failures to use free list searches */
  672. short search_order; /* order to start a fast search at */
  673. const gfp_t gfp_mask; /* gfp mask of a direct compactor */
  674. int order; /* order a direct compactor needs */
  675. int migratetype; /* migratetype of direct compactor */
  676. const unsigned int alloc_flags; /* alloc flags of a direct compactor */
  677. const int highest_zoneidx; /* zone index of a direct compactor */
  678. enum migrate_mode mode; /* Async or sync migration mode */
  679. bool ignore_skip_hint; /* Scan blocks even if marked skip */
  680. bool no_set_skip_hint; /* Don't mark blocks for skipping */
  681. bool ignore_block_suitable; /* Scan blocks considered unsuitable */
  682. bool direct_compaction; /* False from kcompactd or /proc/... */
  683. bool proactive_compaction; /* kcompactd proactive compaction */
  684. bool whole_zone; /* Whole zone should/has been scanned */
  685. bool contended; /* Signal lock contention */
  686. bool finish_pageblock; /* Scan the remainder of a pageblock. Used
  687. * when there are potentially transient
  688. * isolation or migration failures to
  689. * ensure forward progress.
  690. */
  691. bool alloc_contig; /* alloc_contig_range allocation */
  692. };
  693. /*
  694. * Used in direct compaction when a page should be taken from the freelists
  695. * immediately when one is created during the free path.
  696. */
  697. struct capture_control {
  698. struct compact_control *cc;
  699. struct page *page;
  700. };
  701. unsigned long
  702. isolate_freepages_range(struct compact_control *cc,
  703. unsigned long start_pfn, unsigned long end_pfn);
  704. int
  705. isolate_migratepages_range(struct compact_control *cc,
  706. unsigned long low_pfn, unsigned long end_pfn);
  707. int __alloc_contig_migrate_range(struct compact_control *cc,
  708. unsigned long start, unsigned long end,
  709. int migratetype);
  710. /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
  711. void init_cma_reserved_pageblock(struct page *page);
  712. #endif /* CONFIG_COMPACTION || CONFIG_CMA */
  713. int find_suitable_fallback(struct free_area *area, unsigned int order,
  714. int migratetype, bool only_stealable, bool *can_steal);
  715. static inline bool free_area_empty(struct free_area *area, int migratetype)
  716. {
  717. return list_empty(&area->free_list[migratetype]);
  718. }
  719. /* mm/util.c */
  720. struct anon_vma *folio_anon_vma(struct folio *folio);
  721. #ifdef CONFIG_MMU
  722. void unmap_mapping_folio(struct folio *folio);
  723. extern long populate_vma_page_range(struct vm_area_struct *vma,
  724. unsigned long start, unsigned long end, int *locked);
  725. extern long faultin_page_range(struct mm_struct *mm, unsigned long start,
  726. unsigned long end, bool write, int *locked);
  727. extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
  728. unsigned long bytes);
  729. /*
  730. * NOTE: This function can't tell whether the folio is "fully mapped" in the
  731. * range.
  732. * "fully mapped" means all the pages of folio is associated with the page
  733. * table of range while this function just check whether the folio range is
  734. * within the range [start, end). Function caller needs to do page table
  735. * check if it cares about the page table association.
  736. *
  737. * Typical usage (like mlock or madvise) is:
738. * The caller knows at least one page of the folio is associated with the
739. * VMA's page table and the range [start, end) intersects the VMA range. The
740. * caller wants to know whether the folio is fully associated with the range.
741. * It calls this function first to check whether the folio is in the range.
742. * Then it checks the page table to know whether the folio is fully mapped.
  743. */
  744. static inline bool
  745. folio_within_range(struct folio *folio, struct vm_area_struct *vma,
  746. unsigned long start, unsigned long end)
  747. {
  748. pgoff_t pgoff, addr;
  749. unsigned long vma_pglen = vma_pages(vma);
  750. VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio);
  751. if (start > end)
  752. return false;
  753. if (start < vma->vm_start)
  754. start = vma->vm_start;
  755. if (end > vma->vm_end)
  756. end = vma->vm_end;
  757. pgoff = folio_pgoff(folio);
  758. /* if folio start address is not in vma range */
  759. if (!in_range(pgoff, vma->vm_pgoff, vma_pglen))
  760. return false;
  761. addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
  762. return !(addr < start || end - addr < folio_size(folio));
  763. }
  764. static inline bool
  765. folio_within_vma(struct folio *folio, struct vm_area_struct *vma)
  766. {
  767. return folio_within_range(folio, vma, vma->vm_start, vma->vm_end);
  768. }
  769. /*
  770. * mlock_vma_folio() and munlock_vma_folio():
  771. * should be called with vma's mmap_lock held for read or write,
  772. * under page table lock for the pte/pmd being added or removed.
  773. *
  774. * mlock is usually called at the end of folio_add_*_rmap_*(), munlock at
  775. * the end of folio_remove_rmap_*(); but new anon folios are managed by
  776. * folio_add_lru_vma() calling mlock_new_folio().
  777. */
  778. void mlock_folio(struct folio *folio);
  779. static inline void mlock_vma_folio(struct folio *folio,
  780. struct vm_area_struct *vma)
  781. {
  782. /*
  783. * The VM_SPECIAL check here serves two purposes.
  784. * 1) VM_IO check prevents migration from double-counting during mlock.
  785. * 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED
  786. * is never left set on a VM_SPECIAL vma, there is an interval while
  787. * file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may
  788. * still be set while VM_SPECIAL bits are added: so ignore it then.
  789. */
  790. if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED))
  791. mlock_folio(folio);
  792. }
  793. void munlock_folio(struct folio *folio);
  794. static inline void munlock_vma_folio(struct folio *folio,
  795. struct vm_area_struct *vma)
  796. {
  797. /*
798. * munlock whenever this function is called. Ideally, we should only
799. * munlock if some page of the folio is unmapped from the VMA, leaving
800. * the folio not fully mapped to the VMA.
  801. *
  802. * But it's not easy to confirm that's the situation. So we
  803. * always munlock the folio and page reclaim will correct it
  804. * if it's wrong.
  805. */
  806. if (unlikely(vma->vm_flags & VM_LOCKED))
  807. munlock_folio(folio);
  808. }
  809. void mlock_new_folio(struct folio *folio);
  810. bool need_mlock_drain(int cpu);
  811. void mlock_drain_local(void);
  812. void mlock_drain_remote(int cpu);
  813. extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
  814. /**
  815. * vma_address - Find the virtual address a page range is mapped at
  816. * @vma: The vma which maps this object.
  817. * @pgoff: The page offset within its object.
  818. * @nr_pages: The number of pages to consider.
  819. *
  820. * If any page in this range is mapped by this VMA, return the first address
  821. * where any of these pages appear. Otherwise, return -EFAULT.
  822. */
  823. static inline unsigned long vma_address(struct vm_area_struct *vma,
  824. pgoff_t pgoff, unsigned long nr_pages)
  825. {
  826. unsigned long address;
  827. if (pgoff >= vma->vm_pgoff) {
  828. address = vma->vm_start +
  829. ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
  830. /* Check for address beyond vma (or wrapped through 0?) */
  831. if (address < vma->vm_start || address >= vma->vm_end)
  832. address = -EFAULT;
  833. } else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) {
  834. /* Test above avoids possibility of wrap to 0 on 32-bit */
  835. address = vma->vm_start;
  836. } else {
  837. address = -EFAULT;
  838. }
  839. return address;
  840. }
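/*
 * Worked example (editor's addition, assuming 4K pages): with
 * vma->vm_start == 0x10000 and vma->vm_pgoff == 0x100, a single page at
 * pgoff 0x103 yields 0x10000 + (0x103 - 0x100) * PAGE_SIZE == 0x13000,
 * while a range ending entirely below vm_pgoff yields -EFAULT.
 */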
  841. /*
  842. * Then at what user virtual address will none of the range be found in vma?
  843. * Assumes that vma_address() already returned a good starting address.
  844. */
  845. static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw)
  846. {
  847. struct vm_area_struct *vma = pvmw->vma;
  848. pgoff_t pgoff;
  849. unsigned long address;
  850. /* Common case, plus ->pgoff is invalid for KSM */
  851. if (pvmw->nr_pages == 1)
  852. return pvmw->address + PAGE_SIZE;
  853. pgoff = pvmw->pgoff + pvmw->nr_pages;
  854. address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
  855. /* Check for address beyond vma (or wrapped through 0?) */
  856. if (address < vma->vm_start || address > vma->vm_end)
  857. address = vma->vm_end;
  858. return address;
  859. }
  860. static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
  861. struct file *fpin)
  862. {
  863. int flags = vmf->flags;
  864. if (fpin)
  865. return fpin;
  866. /*
  867. * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
  868. * anything, so we only pin the file and drop the mmap_lock if only
  869. * FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt.
  870. */
  871. if (fault_flag_allow_retry_first(flags) &&
  872. !(flags & FAULT_FLAG_RETRY_NOWAIT)) {
  873. fpin = get_file(vmf->vma->vm_file);
  874. release_fault_lock(vmf);
  875. }
  876. return fpin;
  877. }
  878. #else /* !CONFIG_MMU */
  879. static inline void unmap_mapping_folio(struct folio *folio) { }
  880. static inline void mlock_new_folio(struct folio *folio) { }
  881. static inline bool need_mlock_drain(int cpu) { return false; }
  882. static inline void mlock_drain_local(void) { }
  883. static inline void mlock_drain_remote(int cpu) { }
  884. static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
  885. {
  886. }
  887. #endif /* !CONFIG_MMU */
  888. /* Memory initialisation debug and verification */
  889. #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
  890. DECLARE_STATIC_KEY_TRUE(deferred_pages);
  891. bool __init deferred_grow_zone(struct zone *zone, unsigned int order);
  892. #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
  893. enum mminit_level {
  894. MMINIT_WARNING,
  895. MMINIT_VERIFY,
  896. MMINIT_TRACE
  897. };
  898. #ifdef CONFIG_DEBUG_MEMORY_INIT
  899. extern int mminit_loglevel;
  900. #define mminit_dprintk(level, prefix, fmt, arg...) \
  901. do { \
  902. if (level < mminit_loglevel) { \
  903. if (level <= MMINIT_WARNING) \
  904. pr_warn("mminit::" prefix " " fmt, ##arg); \
  905. else \
  906. printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
  907. } \
  908. } while (0)
  909. extern void mminit_verify_pageflags_layout(void);
  910. extern void mminit_verify_zonelist(void);
  911. #else
  912. static inline void mminit_dprintk(enum mminit_level level,
  913. const char *prefix, const char *fmt, ...)
  914. {
  915. }
  916. static inline void mminit_verify_pageflags_layout(void)
  917. {
  918. }
  919. static inline void mminit_verify_zonelist(void)
  920. {
  921. }
  922. #endif /* CONFIG_DEBUG_MEMORY_INIT */
  923. #define NODE_RECLAIM_NOSCAN -2
  924. #define NODE_RECLAIM_FULL -1
  925. #define NODE_RECLAIM_SOME 0
  926. #define NODE_RECLAIM_SUCCESS 1
  927. #ifdef CONFIG_NUMA
  928. extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
  929. extern int find_next_best_node(int node, nodemask_t *used_node_mask);
  930. #else
  931. static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
  932. unsigned int order)
  933. {
  934. return NODE_RECLAIM_NOSCAN;
  935. }
  936. static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
  937. {
  938. return NUMA_NO_NODE;
  939. }
  940. #endif
  941. /*
  942. * mm/memory-failure.c
  943. */
  944. #ifdef CONFIG_MEMORY_FAILURE
  945. int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill);
  946. void shake_folio(struct folio *folio);
  947. extern int hwpoison_filter(struct page *p);
  948. extern u32 hwpoison_filter_dev_major;
  949. extern u32 hwpoison_filter_dev_minor;
  950. extern u64 hwpoison_filter_flags_mask;
  951. extern u64 hwpoison_filter_flags_value;
  952. extern u64 hwpoison_filter_memcg;
  953. extern u32 hwpoison_filter_enable;
  954. #define MAGIC_HWPOISON 0x48575053U /* HWPS */
  955. void SetPageHWPoisonTakenOff(struct page *page);
  956. void ClearPageHWPoisonTakenOff(struct page *page);
  957. bool take_page_off_buddy(struct page *page);
  958. bool put_page_back_buddy(struct page *page);
  959. struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
  960. void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
  961. struct vm_area_struct *vma, struct list_head *to_kill,
  962. unsigned long ksm_addr);
  963. unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
  964. #else
  965. static inline int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
  966. {
  967. return -EBUSY;
  968. }
  969. #endif
  970. extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long,
  971. unsigned long, unsigned long,
  972. unsigned long, unsigned long);
  973. extern void set_pageblock_order(void);
  974. struct folio *alloc_migrate_folio(struct folio *src, unsigned long private);
  975. unsigned long reclaim_pages(struct list_head *folio_list);
  976. unsigned int reclaim_clean_pages_from_list(struct zone *zone,
  977. struct list_head *folio_list);
  978. /* The ALLOC_WMARK bits are used as an index to zone->watermark */
  979. #define ALLOC_WMARK_MIN WMARK_MIN
  980. #define ALLOC_WMARK_LOW WMARK_LOW
  981. #define ALLOC_WMARK_HIGH WMARK_HIGH
  982. #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
  983. /* Mask to get the watermark bits */
  984. #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
  985. /*
  986. * Only MMU archs have async oom victim reclaim - aka oom_reaper so we
  987. * cannot assume a reduced access to memory reserves is sufficient for
  988. * !MMU
  989. */
  990. #ifdef CONFIG_MMU
  991. #define ALLOC_OOM 0x08
  992. #else
  993. #define ALLOC_OOM ALLOC_NO_WATERMARKS
  994. #endif
  995. #define ALLOC_NON_BLOCK 0x10 /* Caller cannot block. Allow access
  996. * to 25% of the min watermark or
  997. * 62.5% if __GFP_HIGH is set.
  998. */
  999. #define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50%
  1000. * of the min watermark.
  1001. */
  1002. #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
  1003. #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
  1004. #ifdef CONFIG_ZONE_DMA32
  1005. #define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */
  1006. #else
  1007. #define ALLOC_NOFRAGMENT 0x0
  1008. #endif
  1009. #define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */
  1010. #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
  1011. /* Flags that allow allocations below the min watermark. */
  1012. #define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM)
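/*
 * Illustrative sketch (editor's addition): the low ALLOC_WMARK_* bits index
 * the zone watermarks, so the watermark an allocation must clear can be
 * looked up as below. The helper name is hypothetical.
 */
static inline unsigned long example_alloc_watermark(struct zone *zone,
						    unsigned int alloc_flags)
{
	return wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
}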
  1013. enum ttu_flags;
  1014. struct tlbflush_unmap_batch;
  1015. /*
  1016. * only for MM internal work items which do not depend on
  1017. * any allocations or locks which might depend on allocations
  1018. */
  1019. extern struct workqueue_struct *mm_percpu_wq;
  1020. #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
  1021. void try_to_unmap_flush(void);
  1022. void try_to_unmap_flush_dirty(void);
  1023. void flush_tlb_batched_pending(struct mm_struct *mm);
  1024. #else
  1025. static inline void try_to_unmap_flush(void)
  1026. {
  1027. }
  1028. static inline void try_to_unmap_flush_dirty(void)
  1029. {
  1030. }
  1031. static inline void flush_tlb_batched_pending(struct mm_struct *mm)
  1032. {
  1033. }
  1034. #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
  1035. extern const struct trace_print_flags pageflag_names[];
  1036. extern const struct trace_print_flags vmaflag_names[];
  1037. extern const struct trace_print_flags gfpflag_names[];
  1038. static inline bool is_migrate_highatomic(enum migratetype migratetype)
  1039. {
  1040. return migratetype == MIGRATE_HIGHATOMIC;
  1041. }
  1042. void setup_zone_pageset(struct zone *zone);
  1043. struct migration_target_control {
  1044. int nid; /* preferred node id */
  1045. nodemask_t *nmask;
  1046. gfp_t gfp_mask;
  1047. enum migrate_reason reason;
  1048. };
  1049. /*
  1050. * mm/filemap.c
  1051. */
  1052. size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
  1053. struct folio *folio, loff_t fpos, size_t size);
  1054. /*
  1055. * mm/vmalloc.c
  1056. */
  1057. #ifdef CONFIG_MMU
  1058. void __init vmalloc_init(void);
  1059. int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
  1060. pgprot_t prot, struct page **pages, unsigned int page_shift);
  1061. #else
  1062. static inline void vmalloc_init(void)
  1063. {
  1064. }
  1065. static inline
  1066. int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
  1067. pgprot_t prot, struct page **pages, unsigned int page_shift)
  1068. {
  1069. return -EINVAL;
  1070. }
  1071. #endif
  1072. int __must_check __vmap_pages_range_noflush(unsigned long addr,
  1073. unsigned long end, pgprot_t prot,
  1074. struct page **pages, unsigned int page_shift);
  1075. void vunmap_range_noflush(unsigned long start, unsigned long end);
  1076. void __vunmap_range_noflush(unsigned long start, unsigned long end);
  1077. int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
  1078. unsigned long addr, int *flags, bool writable,
  1079. int *last_cpupid);
  1080. void free_zone_device_folio(struct folio *folio);
  1081. int migrate_device_coherent_folio(struct folio *folio);
  1082. /*
  1083. * mm/gup.c
  1084. */
  1085. int __must_check try_grab_folio(struct folio *folio, int refs,
  1086. unsigned int flags);
  1087. /*
  1088. * mm/huge_memory.c
  1089. */
  1090. void touch_pud(struct vm_area_struct *vma, unsigned long addr,
  1091. pud_t *pud, bool write);
  1092. void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
  1093. pmd_t *pmd, bool write);
  1094. enum {
  1095. /* mark page accessed */
  1096. FOLL_TOUCH = 1 << 16,
  1097. /* a retry, previous pass started an IO */
  1098. FOLL_TRIED = 1 << 17,
  1099. /* we are working on non-current tsk/mm */
  1100. FOLL_REMOTE = 1 << 18,
  1101. /* pages must be released via unpin_user_page */
  1102. FOLL_PIN = 1 << 19,
  1103. /* gup_fast: prevent fall-back to slow gup */
  1104. FOLL_FAST_ONLY = 1 << 20,
  1105. /* allow unlocking the mmap lock */
  1106. FOLL_UNLOCKABLE = 1 << 21,
  1107. /* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */
  1108. FOLL_MADV_POPULATE = 1 << 22,
  1109. };
  1110. #define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \
  1111. FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \
  1112. FOLL_MADV_POPULATE)
  1113. /*
1114. * Indicates, for pages that are write-protected in the page table,
  1115. * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the
  1116. * GUP pin will remain consistent with the pages mapped into the page tables
  1117. * of the MM.
  1118. *
  1119. * Temporary unmapping of PageAnonExclusive() pages or clearing of
  1120. * PageAnonExclusive() has to protect against concurrent GUP:
  1121. * * Ordinary GUP: Using the PT lock
  1122. * * GUP-fast and fork(): mm->write_protect_seq
  1123. * * GUP-fast and KSM or temporary unmapping (swap, migration): see
  1124. * folio_try_share_anon_rmap_*()
  1125. *
  1126. * Must be called with the (sub)page that's actually referenced via the
  1127. * page table entry, which might not necessarily be the head page for a
  1128. * PTE-mapped THP.
  1129. *
  1130. * If the vma is NULL, we're coming from the GUP-fast path and might have
  1131. * to fallback to the slow path just to lookup the vma.
  1132. */
  1133. static inline bool gup_must_unshare(struct vm_area_struct *vma,
  1134. unsigned int flags, struct page *page)
  1135. {
  1136. /*
  1137. * FOLL_WRITE is implicitly handled correctly as the page table entry
  1138. * has to be writable -- and if it references (part of) an anonymous
  1139. * folio, that part is required to be marked exclusive.
  1140. */
  1141. if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN)
  1142. return false;
  1143. /*
  1144. * Note: PageAnon(page) is stable until the page is actually getting
  1145. * freed.
  1146. */
  1147. if (!PageAnon(page)) {
  1148. /*
1149. * We only care about R/O long-term pinning: R/O short-term
  1150. * pinning does not have the semantics to observe successive
  1151. * changes through the process page tables.
  1152. */
  1153. if (!(flags & FOLL_LONGTERM))
  1154. return false;
  1155. /* We really need the vma ... */
  1156. if (!vma)
  1157. return true;
  1158. /*
  1159. * ... because we only care about writable private ("COW")
  1160. * mappings where we have to break COW early.
  1161. */
  1162. return is_cow_mapping(vma->vm_flags);
  1163. }
  1164. /* Paired with a memory barrier in folio_try_share_anon_rmap_*(). */
  1165. if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
  1166. smp_rmb();
  1167. /*
  1168. * Note that PageKsm() pages cannot be exclusive, and consequently,
  1169. * cannot get pinned.
  1170. */
  1171. return !PageAnonExclusive(page);
  1172. }
  1173. extern bool mirrored_kernelcore;
  1174. extern bool memblock_has_mirror(void);
  1175. static __always_inline void vma_set_range(struct vm_area_struct *vma,
  1176. unsigned long start, unsigned long end,
  1177. pgoff_t pgoff)
  1178. {
  1179. vma->vm_start = start;
  1180. vma->vm_end = end;
  1181. vma->vm_pgoff = pgoff;
  1182. }
  1183. static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
  1184. {
  1185. /*
  1186. * NOTE: we must check this before VM_SOFTDIRTY on soft-dirty
1187. * enablement, because when soft-dirty is not compiled in,
1188. * VM_SOFTDIRTY is defined as 0x0, so !(vm_flags & VM_SOFTDIRTY)
1189. * would always be true.
  1190. */
  1191. if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
  1192. return false;
  1193. /*
  1194. * Soft-dirty is kind of special: its tracking is enabled when the
1195. * vma flag is not set.
  1196. */
  1197. return !(vma->vm_flags & VM_SOFTDIRTY);
  1198. }
  1199. static inline bool pmd_needs_soft_dirty_wp(struct vm_area_struct *vma, pmd_t pmd)
  1200. {
  1201. return vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd);
  1202. }
  1203. static inline bool pte_needs_soft_dirty_wp(struct vm_area_struct *vma, pte_t pte)
  1204. {
  1205. return vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte);
  1206. }
  1207. void __meminit __init_single_page(struct page *page, unsigned long pfn,
  1208. unsigned long zone, int nid);
  1209. /* shrinker related functions */
  1210. unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
  1211. int priority);
  1212. #ifdef CONFIG_64BIT
  1213. static inline int can_do_mseal(unsigned long flags)
  1214. {
  1215. if (flags)
  1216. return -EINVAL;
  1217. return 0;
  1218. }
  1219. #else
  1220. static inline int can_do_mseal(unsigned long flags)
  1221. {
  1222. return -EPERM;
  1223. }
  1224. #endif
  1225. #ifdef CONFIG_SHRINKER_DEBUG
  1226. static inline __printf(2, 0) int shrinker_debugfs_name_alloc(
  1227. struct shrinker *shrinker, const char *fmt, va_list ap)
  1228. {
  1229. shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
  1230. return shrinker->name ? 0 : -ENOMEM;
  1231. }
  1232. static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
  1233. {
  1234. kfree_const(shrinker->name);
  1235. shrinker->name = NULL;
  1236. }
  1237. extern int shrinker_debugfs_add(struct shrinker *shrinker);
  1238. extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
  1239. int *debugfs_id);
  1240. extern void shrinker_debugfs_remove(struct dentry *debugfs_entry,
  1241. int debugfs_id);
  1242. #else /* CONFIG_SHRINKER_DEBUG */
  1243. static inline int shrinker_debugfs_add(struct shrinker *shrinker)
  1244. {
  1245. return 0;
  1246. }
  1247. static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker,
  1248. const char *fmt, va_list ap)
  1249. {
  1250. return 0;
  1251. }
  1252. static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
  1253. {
  1254. }
  1255. static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
  1256. int *debugfs_id)
  1257. {
  1258. *debugfs_id = -1;
  1259. return NULL;
  1260. }
  1261. static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
  1262. int debugfs_id)
  1263. {
  1264. }
  1265. #endif /* CONFIG_SHRINKER_DEBUG */
  1266. /* Only track the nodes of mappings with shadow entries */
  1267. void workingset_update_node(struct xa_node *node);
  1268. extern struct list_lru shadow_nodes;
  1269. /* mremap.c */
  1270. unsigned long move_page_tables(struct vm_area_struct *vma,
  1271. unsigned long old_addr, struct vm_area_struct *new_vma,
  1272. unsigned long new_addr, unsigned long len,
  1273. bool need_rmap_locks, bool for_stack);
  1274. #ifdef CONFIG_UNACCEPTED_MEMORY
  1275. void accept_page(struct page *page);
  1276. #else /* CONFIG_UNACCEPTED_MEMORY */
  1277. static inline void accept_page(struct page *page)
  1278. {
  1279. }
  1280. #endif /* CONFIG_UNACCEPTED_MEMORY */
  1281. #endif /* __MM_INTERNAL_H */