madvise.c 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * linux/mm/madvise.c
  4. *
  5. * Copyright (C) 1999 Linus Torvalds
  6. * Copyright (C) 2002 Christoph Hellwig
  7. */
  8. #include <linux/mman.h>
  9. #include <linux/pagemap.h>
  10. #include <linux/syscalls.h>
  11. #include <linux/mempolicy.h>
  12. #include <linux/page-isolation.h>
  13. #include <linux/page_idle.h>
  14. #include <linux/userfaultfd_k.h>
  15. #include <linux/hugetlb.h>
  16. #include <linux/falloc.h>
  17. #include <linux/fadvise.h>
  18. #include <linux/sched.h>
  19. #include <linux/sched/mm.h>
  20. #include <linux/mm_inline.h>
  21. #include <linux/string.h>
  22. #include <linux/uio.h>
  23. #include <linux/ksm.h>
  24. #include <linux/fs.h>
  25. #include <linux/file.h>
  26. #include <linux/blkdev.h>
  27. #include <linux/backing-dev.h>
  28. #include <linux/pagewalk.h>
  29. #include <linux/swap.h>
  30. #include <linux/swapops.h>
  31. #include <linux/shmem_fs.h>
  32. #include <linux/mmu_notifier.h>
  33. #include <asm/tlb.h>
  34. #include "internal.h"
  35. #include "swap.h"
/*
 * Per-walk state for the MADV_COLD/MADV_PAGEOUT page-table walkers:
 * the TLB gather used to batch flushes, and whether folios are reclaimed
 * (pageout == true, MADV_PAGEOUT) or merely deactivated (MADV_COLD).
 */
struct madvise_walk_private {
	struct mmu_gather *tlb;
	bool pageout;	/* true for MADV_PAGEOUT, false for MADV_COLD */
};
  40. /*
  41. * Any behaviour which results in changes to the vma->vm_flags needs to
  42. * take mmap_lock for writing. Others, which simply traverse vmas, need
  43. * to only take it for reading.
  44. */
  45. static int madvise_need_mmap_write(int behavior)
  46. {
  47. switch (behavior) {
  48. case MADV_REMOVE:
  49. case MADV_WILLNEED:
  50. case MADV_DONTNEED:
  51. case MADV_DONTNEED_LOCKED:
  52. case MADV_COLD:
  53. case MADV_PAGEOUT:
  54. case MADV_FREE:
  55. case MADV_POPULATE_READ:
  56. case MADV_POPULATE_WRITE:
  57. case MADV_COLLAPSE:
  58. return 0;
  59. default:
  60. /* be safe, default to 1. list exceptions explicitly */
  61. return 1;
  62. }
  63. }
#ifdef CONFIG_ANON_VMA_NAME
/*
 * Allocate a refcounted anon_vma_name holding a private copy of @name.
 * Returns NULL on allocation failure.  Released via anon_vma_name_free()
 * once the last kref is dropped.
 */
struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
	struct anon_vma_name *anon_name;
	size_t count;

	/* Add 1 for NUL terminator at the end of the anon_name->name */
	count = strlen(name) + 1;
	anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
	if (anon_name) {
		kref_init(&anon_name->kref);
		memcpy(anon_name->name, name, count);
	}

	return anon_name;
}

/* kref release callback: frees the name once the last reference is gone. */
void anon_vma_name_free(struct kref *kref)
{
	struct anon_vma_name *anon_name =
			container_of(kref, struct anon_vma_name, kref);
	kfree(anon_name);
}

/* Return the vma's anon name; caller must hold mmap_lock (any mode). */
struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
	mmap_assert_locked(vma->vm_mm);

	return vma->anon_name;
}

/* mmap_lock should be write-locked */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	struct anon_vma_name *orig_name = anon_vma_name(vma);

	/* Clearing the name: drop the reference on the old one, if any. */
	if (!anon_name) {
		vma->anon_name = NULL;
		anon_vma_name_put(orig_name);
		return 0;
	}

	/* Same name already set: nothing to do. */
	if (anon_vma_name_eq(orig_name, anon_name))
		return 0;

	/* Take a (possibly shared) reference on the new name, drop the old. */
	vma->anon_name = anon_vma_name_reuse(anon_name);
	anon_vma_name_put(orig_name);

	return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	/* Setting a name is not supported without CONFIG_ANON_VMA_NAME. */
	if (anon_name)
		return -EINVAL;

	return 0;
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
 * Update the vm_flags on region of a vma, splitting it or merging it as
 * necessary.  Must be called with mmap_lock held for writing;
 * Caller should ensure anon_name stability by raising its refcount even when
 * anon_name belongs to a valid vma because this function might free that vma.
 *
 * On success *prev points at the (possibly merged/split) vma covering the
 * range.  Returns 0 or a negative errno from the vma modification.
 */
static int madvise_update_vma(struct vm_area_struct *vma,
			      struct vm_area_struct **prev, unsigned long start,
			      unsigned long end, unsigned long new_flags,
			      struct anon_vma_name *anon_name)
{
	struct mm_struct *mm = vma->vm_mm;
	int error;
	VMA_ITERATOR(vmi, mm, start);

	/* Fast path: flags and name already match, nothing to change. */
	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
		*prev = vma;
		return 0;
	}

	/* Split/merge so [start, end) is exactly covered with the new state. */
	vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags,
				    anon_name);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*prev = vma;

	/* vm_flags is protected by the mmap_lock held in write mode. */
	vma_start_write(vma);
	vm_flags_reset(vma, new_flags);
	/* Names only apply to anonymous memory (incl. anonymous shmem). */
	if (!vma->vm_file || vma_is_anon_shmem(vma)) {
		error = replace_anon_vma_name(vma, anon_name);
		if (error)
			return error;
	}

	return 0;
}
#ifdef CONFIG_SWAP
/*
 * MADV_WILLNEED helper for anonymous memory: kick off async swap-in for
 * every swapped-out pte in [start, end).  The pte lock is dropped around
 * each read so the walk can sleep; I/O is batched via the swap plug.
 */
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
				 unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	struct swap_iocb *splug = NULL;
	pte_t *ptep = NULL;
	spinlock_t *ptl;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct folio *folio;

		/*
		 * ptep is NULL initially and after each unlock below, in
		 * which case (re)map and lock the page table; otherwise
		 * ptep++ just advances to the next pte.
		 */
		if (!ptep++) {
			ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
			if (!ptep)
				break;
		}

		pte = ptep_get(ptep);
		if (!is_swap_pte(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		/* Drop the pte lock across the (possibly sleeping) read. */
		pte_unmap_unlock(ptep, ptl);
		ptep = NULL;

		folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					      vma, addr, &splug);
		if (folio)
			folio_put(folio);
	}

	if (ptep)
		pte_unmap_unlock(ptep, ptl);
	swap_read_unplug(splug);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry = swapin_walk_pmd_entry,
	.walk_lock = PGWALK_RDLOCK,
};

/*
 * MADV_WILLNEED helper for shmem: scan the mapping's xarray for swap
 * entries in the range and start async swap-in for each.  RCU is dropped
 * (with the xas paused) around each read since it may sleep.
 */
static void shmem_swapin_range(struct vm_area_struct *vma,
			       unsigned long start, unsigned long end,
			       struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end) - 1;
	struct folio *folio;
	struct swap_iocb *splug = NULL;

	rcu_read_lock();
	xas_for_each(&xas, folio, end_index) {
		unsigned long addr;
		swp_entry_t entry;

		/* Only value entries encode swapped-out pages. */
		if (!xa_is_value(folio))
			continue;
		entry = radix_to_swp_entry(folio);
		/* There might be swapin error entries in shmem mapping. */
		if (non_swap_entry(entry))
			continue;

		addr = vma->vm_start +
			((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT);
		xas_pause(&xas);
		rcu_read_unlock();

		folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
					      vma, addr, &splug);
		if (folio)
			folio_put(folio);

		rcu_read_lock();
	}
	rcu_read_unlock();

	swap_read_unplug(splug);
}
#endif	/* CONFIG_SWAP */
/*
 * Schedule all required I/O operations.  Do not wait for completion.
 *
 * May drop mmap_lock around vfs_fadvise(); in that case *prev is set to
 * NULL to tell the caller that the vma list must be re-walked.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	/* Anonymous memory: start async swap-in of the range. */
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}

	/* Shmem: start async swap-in of swapped-out pagecache entries. */
	if (shmem_mapping(file->f_mapping)) {
		shmem_swapin_range(vma, start, end, file->f_mapping);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
	get_file(file);
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}
  266. static inline bool can_do_file_pageout(struct vm_area_struct *vma)
  267. {
  268. if (!vma->vm_file)
  269. return false;
  270. /*
  271. * paging out pagecache only for non-anonymous mappings that correspond
  272. * to the files the calling process could (if tried) open for writing;
  273. * otherwise we'd be including shared non-exclusive mappings, which
  274. * opens a side channel.
  275. */
  276. return inode_owner_or_capable(&nop_mnt_idmap,
  277. file_inode(vma->vm_file)) ||
  278. file_permission(vma->vm_file, MAY_WRITE) == 0;
  279. }
  280. static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
  281. struct folio *folio, pte_t *ptep,
  282. pte_t pte, bool *any_young,
  283. bool *any_dirty)
  284. {
  285. const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
  286. int max_nr = (end - addr) / PAGE_SIZE;
  287. return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
  288. any_young, any_dirty);
  289. }
/*
 * Page-table walker shared by MADV_COLD and MADV_PAGEOUT.  Ages (and, for
 * pageout, isolates and reclaims) the folios mapped in [addr, end).
 * Handles a PMD-mapped THP in place when possible, splitting it only if
 * the range covers part of it.  Runs with mmap_lock held for reading.
 */
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *start_pte, *pte, ptent;
	spinlock_t *ptl;
	struct folio *folio = NULL;
	LIST_HEAD(folio_list);
	bool pageout_anon_only_filter;
	unsigned int batch_count = 0;
	int nr;

	if (fatal_signal_pending(current))
		return -EINTR;

	/*
	 * Without write access to the backing file, only anon folios may
	 * be paged out (see can_do_file_pageout()).
	 */
	pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
					!can_do_file_pageout(vma);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!is_pmd_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		folio = pmd_folio(orig_pmd);

		/* Do not interfere with other mappings of this folio */
		if (folio_likely_mapped_shared(folio))
			goto huge_unlock;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			goto huge_unlock;

		/* Range covers only part of the THP: split and retry. */
		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			folio_get(folio);
			spin_unlock(ptl);
			folio_lock(folio);
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
			if (!err)
				goto regular_folio;
			return 0;
		}

		/* MADV_COLD: clear the access bit so reclaim sees it cold. */
		if (!pageout && pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&folio_list);
		return 0;
	}

regular_folio:
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
restart:
	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
		nr = 1;
		ptent = ptep_get(pte);

		/* Every SWAP_CLUSTER_MAX ptes, drop the lock to reschedule. */
		if (++batch_count == SWAP_CLUSTER_MAX) {
			batch_count = 0;
			if (need_resched()) {
				arch_leave_lazy_mmu_mode();
				pte_unmap_unlock(start_pte, ptl);
				cond_resched();
				goto restart;
			}
		}

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		/*
		 * If we encounter a large folio, only split it if it is not
		 * fully mapped within the range we are operating on. Otherwise
		 * leave it as is so that it can be swapped out whole. If we
		 * fail to split a folio, leave it in place and advance to the
		 * next pte in the range.
		 */
		if (folio_test_large(folio)) {
			bool any_young;

			nr = madvise_folio_pte_batch(addr, end, folio, pte,
						     ptent, &any_young, NULL);
			if (any_young)
				ptent = pte_mkyoung(ptent);

			if (nr < folio_nr_pages(folio)) {
				int err;

				if (folio_likely_mapped_shared(folio))
					continue;
				if (pageout_anon_only_filter && !folio_test_anon(folio))
					continue;
				if (!folio_trylock(folio))
					continue;
				folio_get(folio);
				/* split_folio() may sleep: drop pte lock. */
				arch_leave_lazy_mmu_mode();
				pte_unmap_unlock(start_pte, ptl);
				start_pte = NULL;
				err = split_folio(folio);
				folio_unlock(folio);
				folio_put(folio);
				start_pte = pte =
					pte_offset_map_lock(mm, pmd, addr, &ptl);
				if (!start_pte)
					break;
				arch_enter_lazy_mmu_mode();
				/* On success, revisit this address (nr = 0). */
				if (!err)
					nr = 0;
				continue;
			}
		}

		/*
		 * Do not interfere with other mappings of this folio and
		 * non-LRU folio. If we have a large folio at this point, we
		 * know it is fully mapped so if its mapcount is the same as its
		 * number of pages, it must be exclusive.
		 */
		if (!folio_test_lru(folio) ||
		    folio_mapcount(folio) != folio_nr_pages(folio))
			continue;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			continue;

		/* MADV_COLD: age the ptes so reclaim sees the folio cold. */
		if (!pageout && pte_young(ptent)) {
			clear_young_dirty_ptes(vma, addr, pte, nr,
					       CYDP_CLEAR_YOUNG);
			tlb_remove_tlb_entries(tlb, pte, nr, addr);
		}

		/*
		 * We are deactivating a folio for accelerating reclaiming.
		 * VM couldn't reclaim the folio unless we clear PG_young.
		 * As a side effect, it makes confuse idle-page tracking
		 * because they will miss recent referenced history.
		 */
		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
	}

	if (start_pte) {
		arch_leave_lazy_mmu_mode();
		pte_unmap_unlock(start_pte, ptl);
	}
	if (pageout)
		reclaim_pages(&folio_list);
	cond_resched();

	return 0;
}
/* Walker shared by MADV_COLD and MADV_PAGEOUT; mode is in walk->private. */
static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
	.walk_lock = PGWALK_RDLOCK,
};
  484. static void madvise_cold_page_range(struct mmu_gather *tlb,
  485. struct vm_area_struct *vma,
  486. unsigned long addr, unsigned long end)
  487. {
  488. struct madvise_walk_private walk_private = {
  489. .pageout = false,
  490. .tlb = tlb,
  491. };
  492. tlb_start_vma(tlb, vma);
  493. walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
  494. tlb_end_vma(tlb, vma);
  495. }
  496. static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
  497. {
  498. return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
  499. }
  500. static long madvise_cold(struct vm_area_struct *vma,
  501. struct vm_area_struct **prev,
  502. unsigned long start_addr, unsigned long end_addr)
  503. {
  504. struct mm_struct *mm = vma->vm_mm;
  505. struct mmu_gather tlb;
  506. *prev = vma;
  507. if (!can_madv_lru_vma(vma))
  508. return -EINVAL;
  509. lru_add_drain();
  510. tlb_gather_mmu(&tlb, mm);
  511. madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
  512. tlb_finish_mmu(&tlb);
  513. return 0;
  514. }
  515. static void madvise_pageout_page_range(struct mmu_gather *tlb,
  516. struct vm_area_struct *vma,
  517. unsigned long addr, unsigned long end)
  518. {
  519. struct madvise_walk_private walk_private = {
  520. .pageout = true,
  521. .tlb = tlb,
  522. };
  523. tlb_start_vma(tlb, vma);
  524. walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
  525. tlb_end_vma(tlb, vma);
  526. }
  527. static long madvise_pageout(struct vm_area_struct *vma,
  528. struct vm_area_struct **prev,
  529. unsigned long start_addr, unsigned long end_addr)
  530. {
  531. struct mm_struct *mm = vma->vm_mm;
  532. struct mmu_gather tlb;
  533. *prev = vma;
  534. if (!can_madv_lru_vma(vma))
  535. return -EINVAL;
  536. /*
  537. * If the VMA belongs to a private file mapping, there can be private
  538. * dirty pages which can be paged out if even this process is neither
  539. * owner nor write capable of the file. We allow private file mappings
  540. * further to pageout dirty anon pages.
  541. */
  542. if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
  543. (vma->vm_flags & VM_MAYSHARE)))
  544. return 0;
  545. lru_add_drain();
  546. tlb_gather_mmu(&tlb, mm);
  547. madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
  548. tlb_finish_mmu(&tlb);
  549. return 0;
  550. }
/*
 * MADV_FREE page-table walker: mark exclusively-mapped anonymous folios
 * in [addr, end) lazyfree (clean + !young so reclaim may discard them)
 * and drop swap entries outright instead of faulting them back in.
 */
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				  unsigned long end, struct mm_walk *walk)
{
	const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *start_pte, *pte, ptent;
	struct folio *folio;
	int nr_swap = 0;
	unsigned long next;
	int nr, max_nr;

	/* A PMD-mapped THP may be handled (or split) in one go. */
	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
		nr = 1;
		ptent = ptep_get(pte);

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte has swp_entry, just clear page table to
		 * prevent swap-in which is more expensive rather than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (!non_swap_entry(entry)) {
				max_nr = (end - addr) / PAGE_SIZE;
				nr = swap_pte_batch(pte, max_nr, ptent);
				/* Accounted in MM_SWAPENTS after the loop. */
				nr_swap -= nr;
				free_swap_and_cache_nr(entry, nr);
				clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
			} else if (is_hwpoison_entry(entry) ||
				   is_poisoned_swp_entry(entry)) {
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			}
			continue;
		}

		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		/*
		 * If we encounter a large folio, only split it if it is not
		 * fully mapped within the range we are operating on. Otherwise
		 * leave it as is so that it can be marked as lazyfree. If we
		 * fail to split a folio, leave it in place and advance to the
		 * next pte in the range.
		 */
		if (folio_test_large(folio)) {
			bool any_young, any_dirty;

			nr = madvise_folio_pte_batch(addr, end, folio, pte,
						     ptent, &any_young, &any_dirty);

			if (nr < folio_nr_pages(folio)) {
				int err;

				if (folio_likely_mapped_shared(folio))
					continue;
				if (!folio_trylock(folio))
					continue;
				folio_get(folio);
				/* split_folio() may sleep: drop pte lock. */
				arch_leave_lazy_mmu_mode();
				pte_unmap_unlock(start_pte, ptl);
				start_pte = NULL;
				err = split_folio(folio);
				folio_unlock(folio);
				folio_put(folio);
				pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				start_pte = pte;
				if (!start_pte)
					break;
				arch_enter_lazy_mmu_mode();
				/* On success, revisit this address (nr = 0). */
				if (!err)
					nr = 0;
				continue;
			}

			if (any_young)
				ptent = pte_mkyoung(ptent);
			if (any_dirty)
				ptent = pte_mkdirty(ptent);
		}

		if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
			if (!folio_trylock(folio))
				continue;
			/*
			 * If we have a large folio at this point, we know it is
			 * fully mapped so if its mapcount is the same as its
			 * number of pages, it must be exclusive.
			 */
			if (folio_mapcount(folio) != folio_nr_pages(folio)) {
				folio_unlock(folio);
				continue;
			}

			if (folio_test_swapcache(folio) &&
			    !folio_free_swap(folio)) {
				folio_unlock(folio);
				continue;
			}

			folio_clear_dirty(folio);
			folio_unlock(folio);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags);
			tlb_remove_tlb_entries(tlb, pte, nr, addr);
		}
		folio_mark_lazyfree(folio);
	}

	if (nr_swap)
		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	if (start_pte) {
		arch_leave_lazy_mmu_mode();
		pte_unmap_unlock(start_pte, ptl);
	}
	cond_resched();

	return 0;
}
/* Walker for MADV_FREE; walk->private is the mmu_gather. */
static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry = madvise_free_pte_range,
	.walk_lock = PGWALK_RDLOCK,
};
/*
 * Apply MADV_FREE to the part of [start_addr, end_addr) covered by @vma.
 * Clamps the range to the vma and runs madvise_free_walk_ops over it under
 * an MMU_NOTIFY_CLEAR notification.  Returns 0 or -EINVAL.
 */
static int madvise_free_single_vma(struct vm_area_struct *vma,
				   unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	/* Clamp the range to the vma; empty intersection is an error. */
	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb);

	return 0;
}
/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range_single call sets things up for shrink_active_list to actually
 * free these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	/* Unmap the range; the backing pages/swap become freeable. */
	zap_page_range_single(vma, start, end - start, NULL);
	return 0;
}
  733. static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
  734. unsigned long start,
  735. unsigned long *end,
  736. int behavior)
  737. {
  738. if (!is_vm_hugetlb_page(vma)) {
  739. unsigned int forbidden = VM_PFNMAP;
  740. if (behavior != MADV_DONTNEED_LOCKED)
  741. forbidden |= VM_LOCKED;
  742. return !(vma->vm_flags & forbidden);
  743. }
  744. if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
  745. return false;
  746. if (start & ~huge_page_mask(hstate_vma(vma)))
  747. return false;
  748. /*
  749. * Madvise callers expect the length to be rounded up to PAGE_SIZE
  750. * boundaries, and may be unaware that this VMA uses huge pages.
  751. * Avoid unexpected data loss by rounding down the number of
  752. * huge pages freed.
  753. */
  754. *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));
  755. return true;
  756. }
/*
 * Common entry for MADV_DONTNEED, MADV_DONTNEED_LOCKED and MADV_FREE.
 * userfaultfd_remove() may drop and retake mmap_lock; in that case *prev
 * is set to NULL and the vma is looked up again before proceeding.
 */
static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	struct mm_struct *mm = vma->vm_mm;

	*prev = vma;
	if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
		return -EINVAL;

	/* Hugetlb rounding (above) may have emptied the range. */
	if (start == end)
		return 0;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_lock has been dropped, prev is stale */

		mmap_read_lock(mm);
		vma = vma_lookup(mm, start);
		if (!vma)
			return -ENOMEM;
		/*
		 * Potential end adjustment for hugetlb vma is OK as
		 * the check below keeps end within vma.
		 */
		if (!madvise_dontneed_free_valid_vma(vma, start, &end,
						     behavior))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_lock was
			 * released the effect of the concurrent
			 * operation may not cause madvise() to
			 * have an undefined result. There may be an
			 * adjacent next vma that we'll walk
			 * next. userfaultfd_remove() will generate an
			 * UFFD_EVENT_REMOVE repetition on the
			 * end-vma->vm_end range, but the manager can
			 * handle a repetition fine.
			 */
			end = vma->vm_end;
		}
		VM_WARN_ON(start >= end);
	}

	if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}
  805. static long madvise_populate(struct mm_struct *mm, unsigned long start,
  806. unsigned long end, int behavior)
  807. {
  808. const bool write = behavior == MADV_POPULATE_WRITE;
  809. int locked = 1;
  810. long pages;
  811. while (start < end) {
  812. /* Populate (prefault) page tables readable/writable. */
  813. pages = faultin_page_range(mm, start, end, write, &locked);
  814. if (!locked) {
  815. mmap_read_lock(mm);
  816. locked = 1;
  817. }
  818. if (pages < 0) {
  819. switch (pages) {
  820. case -EINTR:
  821. return -EINTR;
  822. case -EINVAL: /* Incompatible mappings / permissions. */
  823. return -EINVAL;
  824. case -EHWPOISON:
  825. return -EHWPOISON;
  826. case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
  827. return -EFAULT;
  828. default:
  829. pr_warn_once("%s: unhandled return value: %ld\n",
  830. __func__, pages);
  831. fallthrough;
  832. case -ENOMEM: /* No VMA or out of memory. */
  833. return -ENOMEM;
  834. }
  835. }
  836. start += pages * PAGE_SIZE;
  837. }
  838. return 0;
  839. }
/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 * Sets *prev to NULL because mmap_lock is (transiently) dropped here.
 */
static long madvise_remove(struct vm_area_struct *vma,
			   struct vm_area_struct **prev,
			   unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;
	struct mm_struct *mm = vma->vm_mm;

	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	/* Hole punching needs a file-backed mapping with a host inode. */
	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	/* Only shared mappings with write allowed may drop backing store. */
	if (!vma_is_shared_maywrite(vma))
		return -EACCES;

	/* Translate the user virtual range into a file offset. */
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_rwsem. We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_lock was not released by userfaultfd_remove() */
		mmap_read_unlock(mm);
	}
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	/* Re-take the lock the caller expects to still hold on return. */
	mmap_read_lock(mm);
	return error;
}
/*
 * Apply an madvise behavior to a region of a vma.  madvise_update_vma
 * will handle splitting a vm area into separate areas, each area with its own
 * behavior.
 */
static int madvise_vma_behavior(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end,
				unsigned long behavior)
{
	int error;
	struct anon_vma_name *anon_name;
	unsigned long new_flags = vma->vm_flags;

	/*
	 * NOTE(review): can_modify_vma_madv() presumably rejects advice on
	 * vmas that must not be modified (e.g. sealed) — confirm against its
	 * definition elsewhere in the tree.
	 */
	if (unlikely(!can_modify_vma_madv(vma, behavior)))
		return -EPERM;

	switch (behavior) {
	/* Behaviors handled entirely by dedicated helpers return directly. */
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_COLD:
		return madvise_cold(vma, prev, start, end);
	case MADV_PAGEOUT:
		return madvise_pageout(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
	/*
	 * The remaining behaviors compute new_flags and fall out of the
	 * switch to madvise_update_vma() below.
	 */
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO)
			return -EINVAL;
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED)
			return -EINVAL;
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		if (vma->vm_flags & VM_DROPPABLE)
			return -EINVAL;
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) ||
		    (vma->vm_flags & VM_DROPPABLE))
			return -EINVAL;
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out;
		break;
	case MADV_COLLAPSE:
		return madvise_collapse(vma, prev, start, end);
	}

	/* Preserve the vma's anon name across the flag update. */
	anon_name = anon_vma_name(vma);
	anon_vma_name_get(anon_name);
	error = madvise_update_vma(vma, prev, start, end, new_flags,
				   anon_name);
	anon_vma_name_put(anon_name);

out:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 * MADV_HWPOISON/MADV_SOFT_OFFLINE: walk [start, end) one page (or compound
 * page) at a time and inject a memory failure or soft-offline each one.
 * Requires CAP_SYS_ADMIN.
 */
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	unsigned long size;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	for (; start < end; start += size) {
		unsigned long pfn;
		struct page *page;
		int ret;

		/* Pin the page so we can resolve its pfn. */
		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page.
		 */
		size = page_size(compound_head(page));

		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
		} else {
			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = memory_failure(pfn, MF_ACTION_REQUIRED | MF_COUNT_INCREASED | MF_SW_SIMULATED);
			/* -EOPNOTSUPP from memory_failure() is not fatal here. */
			if (ret == -EOPNOTSUPP)
				ret = 0;
		}

		if (ret)
			return ret;
	}
	return 0;
}
#endif
/*
 * Check that @behavior is an advice value madvise(2) accepts in this
 * kernel configuration (KSM/THP/memory-failure advice only when the
 * corresponding option is enabled).
 */
static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_FREE:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
	case MADV_COLLAPSE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		return true;

	default:
		return false;
	}
}
  1056. static bool process_madvise_behavior_valid(int behavior)
  1057. {
  1058. switch (behavior) {
  1059. case MADV_COLD:
  1060. case MADV_PAGEOUT:
  1061. case MADV_WILLNEED:
  1062. case MADV_COLLAPSE:
  1063. return true;
  1064. default:
  1065. return false;
  1066. }
  1067. }
/*
 * Walk the vmas in range [start,end), and call the visit function on each one.
 * The visit function will get start and end parameters that cover the overlap
 * between the current vma and the original range. Any unmapped regions in the
 * original range will result in this function returning -ENOMEM while still
 * calling the visit function on all of the existing vmas in the range.
 * Must be called with the mmap_lock held for reading or writing.
 */
static
int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
		      unsigned long end, unsigned long arg,
		      int (*visit)(struct vm_area_struct *vma,
				   struct vm_area_struct **prev, unsigned long start,
				   unsigned long end, unsigned long arg))
{
	struct vm_area_struct *vma;
	struct vm_area_struct *prev;
	unsigned long tmp;
	int unmapped_error = 0;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	for (;;) {
		int error;

		/* Still start < end. */
		if (!vma)
			return -ENOMEM;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			/* Unmapped gap before this vma: record it, skip ahead. */
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				break;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = visit(vma, &prev, start, tmp, arg);
		if (error)
			return error;
		start = tmp;
		/* The visitor may have updated *prev (e.g. after a split). */
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		if (start >= end)
			break;
		if (prev)
			vma = find_vma(mm, prev->vm_end);
		else	/* madvise_remove dropped mmap_lock */
			vma = find_vma(mm, start);
	}

	return unmapped_error;
}
  1127. #ifdef CONFIG_ANON_VMA_NAME
  1128. static int madvise_vma_anon_name(struct vm_area_struct *vma,
  1129. struct vm_area_struct **prev,
  1130. unsigned long start, unsigned long end,
  1131. unsigned long anon_name)
  1132. {
  1133. int error;
  1134. /* Only anonymous mappings can be named */
  1135. if (vma->vm_file && !vma_is_anon_shmem(vma))
  1136. return -EBADF;
  1137. error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
  1138. (struct anon_vma_name *)anon_name);
  1139. /*
  1140. * madvise() returns EAGAIN if kernel resources, such as
  1141. * slab, are temporarily unavailable.
  1142. */
  1143. if (error == -ENOMEM)
  1144. error = -EAGAIN;
  1145. return error;
  1146. }
  1147. int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
  1148. unsigned long len_in, struct anon_vma_name *anon_name)
  1149. {
  1150. unsigned long end;
  1151. unsigned long len;
  1152. if (start & ~PAGE_MASK)
  1153. return -EINVAL;
  1154. len = (len_in + ~PAGE_MASK) & PAGE_MASK;
  1155. /* Check to see whether len was rounded up from small -ve to zero */
  1156. if (len_in && !len)
  1157. return -EINVAL;
  1158. end = start + len;
  1159. if (end < start)
  1160. return -EINVAL;
  1161. if (end == start)
  1162. return 0;
  1163. return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
  1164. madvise_vma_anon_name);
  1165. }
  1166. #endif /* CONFIG_ANON_VMA_NAME */
  1167. /*
  1168. * The madvise(2) system call.
  1169. *
  1170. * Applications can use madvise() to advise the kernel how it should
  1171. * handle paging I/O in this VM area. The idea is to help the kernel
  1172. * use appropriate read-ahead and caching techniques. The information
  1173. * provided is advisory only, and can be safely disregarded by the
  1174. * kernel without affecting the correct operation of the application.
  1175. *
  1176. * behavior values:
  1177. * MADV_NORMAL - the default behavior is to read clusters. This
  1178. * results in some read-ahead and read-behind.
  1179. * MADV_RANDOM - the system should read the minimum amount of data
  1180. * on any access, since it is unlikely that the appli-
  1181. * cation will need more than what it asks for.
  1182. * MADV_SEQUENTIAL - pages in the given range will probably be accessed
  1183. * once, so they can be aggressively read ahead, and
  1184. * can be freed soon after they are accessed.
  1185. * MADV_WILLNEED - the application is notifying the system to read
  1186. * some pages ahead.
  1187. * MADV_DONTNEED - the application is finished with the given range,
  1188. * so the kernel can free resources associated with it.
  1189. * MADV_FREE - the application marks pages in the given range as lazy free,
  1190. * where actual purges are postponed until memory pressure happens.
  1191. * MADV_REMOVE - the application wants to free up the given range of
  1192. * pages and associated backing store.
  1193. * MADV_DONTFORK - omit this area from child's address space when forking:
  1194. * typically, to avoid COWing pages pinned by get_user_pages().
  1195. * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
  1196. * MADV_WIPEONFORK - present the child process with zero-filled memory in this
  1197. * range after a fork.
  1198. * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
  1199. * MADV_HWPOISON - trigger memory error handler as if the given memory range
  1200. * were corrupted by unrecoverable hardware memory failure.
  1201. * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
  1202. * MADV_MERGEABLE - the application recommends that KSM try to merge pages in
  1203. * this area with pages of identical content from other such areas.
  1204. * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
  1205. * MADV_HUGEPAGE - the application wants to back the given range by transparent
  1206. * huge pages in the future. Existing pages might be coalesced and
  1207. * new pages might be allocated as THP.
  1208. * MADV_NOHUGEPAGE - mark the given range as not worth being backed by
  1209. * transparent huge pages so the existing pages will not be
  1210. * coalesced into THP and new pages will not be allocated as THP.
  1211. * MADV_COLLAPSE - synchronously coalesce pages into new THP.
  1212. * MADV_DONTDUMP - the application wants to prevent pages in the given range
  1213. * from being included in its core dump.
  1214. * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
  1215. * MADV_COLD - the application is not expected to use this memory soon,
  1216. * deactivate pages in this range so that they can be reclaimed
  1217. * easily if memory pressure happens.
  1218. * MADV_PAGEOUT - the application is not expected to use this memory soon,
  1219. * page out the pages in this range immediately.
  1220. * MADV_POPULATE_READ - populate (prefault) page tables readable by
  1221. * triggering read faults if required
  1222. * MADV_POPULATE_WRITE - populate (prefault) page tables writable by
  1223. * triggering write faults if required
  1224. *
  1225. * return values:
  1226. * zero - success
  1227. * -EINVAL - start + len < 0, start is not page-aligned,
  1228. * "behavior" is not a valid value, or application
  1229. * is attempting to release locked or shared pages,
  1230. * or the specified address range includes file, Huge TLB,
 * MAP_SHARED or VM_PFNMAP range.
  1232. * -ENOMEM - addresses in the specified range are not currently
  1233. * mapped, or are outside the AS of the process.
  1234. * -EIO - an I/O error occurred while paging in data.
  1235. * -EBADF - map exists, but area maps something that isn't a file.
  1236. * -EAGAIN - a kernel resource was temporarily unavailable.
  1237. * -EPERM - memory is sealed.
  1238. */
/*
 * Core madvise implementation, shared by the madvise(2) and
 * process_madvise(2) entry points.  Validates the range, takes mmap_lock
 * in the mode the behavior requires, then dispatches.
 */
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
	unsigned long end;
	int error;
	int write;
	size_t len;
	struct blk_plug plug;

	if (!madvise_behavior_valid(behavior))
		return -EINVAL;

	if (!PAGE_ALIGNED(start))
		return -EINVAL;
	len = PAGE_ALIGN(len_in);

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

#ifdef CONFIG_MEMORY_FAILURE
	/* Error injection runs without mmap_lock; note it uses the
	 * un-rounded length (start + len_in). */
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_inject_error(behavior, start, start + len_in);
#endif

	/* Some behaviors need mmap_lock for write (e.g. vma splitting). */
	write = madvise_need_mmap_write(behavior);
	if (write) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
	} else {
		mmap_read_lock(mm);
	}

	/*
	 * Untag the address under mmap_lock and recompute end from the
	 * untagged start so both bounds refer to the same address space.
	 */
	start = untagged_addr_remote(mm, start);
	end = start + len;

	/* Plug block I/O so readahead issued below can be batched. */
	blk_start_plug(&plug);
	switch (behavior) {
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
		error = madvise_populate(mm, start, end, behavior);
		break;
	default:
		error = madvise_walk_vmas(mm, start, end, behavior,
					  madvise_vma_behavior);
		break;
	}
	blk_finish_plug(&plug);

	if (write)
		mmap_write_unlock(mm);
	else
		mmap_read_unlock(mm);

	return error;
}
/* madvise(2) entry point: applies advice to the calling process's mm. */
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	return do_madvise(current->mm, start, len_in, behavior);
}
/*
 * process_madvise(2) entry point: apply @behavior to the ranges in @vec of
 * the process identified by @pidfd.  Requires PTRACE_MODE_READ access to
 * the target and, for a foreign mm, CAP_SYS_NICE.  Returns the number of
 * bytes advised, or a negative errno if nothing was processed.
 */
SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
		size_t, vlen, int, behavior, unsigned int, flags)
{
	ssize_t ret;
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	struct task_struct *task;
	struct mm_struct *mm;
	size_t total_len;
	unsigned int f_flags;

	/* No flags are currently defined. */
	if (flags != 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
	if (ret < 0)
		goto out;

	task = pidfd_get_task(pidfd, &f_flags);
	if (IS_ERR(task)) {
		ret = PTR_ERR(task);
		goto free_iov;
	}

	if (!process_madvise_behavior_valid(behavior)) {
		ret = -EINVAL;
		goto release_task;
	}

	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
	if (IS_ERR_OR_NULL(mm)) {
		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
		goto release_task;
	}

	/*
	 * Require CAP_SYS_NICE for influencing process performance. Note that
	 * only non-destructive hints are currently supported.
	 */
	if (mm != current->mm && !capable(CAP_SYS_NICE)) {
		ret = -EPERM;
		goto release_mm;
	}

	total_len = iov_iter_count(&iter);

	/* Advise each range in turn; stop at the first failure. */
	while (iov_iter_count(&iter)) {
		ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
					iter_iov_len(&iter), behavior);
		if (ret < 0)
			break;
		iov_iter_advance(&iter, iter_iov_len(&iter));
	}

	/* Report bytes processed if any; otherwise the do_madvise() error. */
	ret = (total_len - iov_iter_count(&iter)) ? : ret;

release_mm:
	mmput(mm);
release_task:
	put_task_struct(task);
free_iov:
	kfree(iov);
out:
	return ret;
}