// SPDX-License-Identifier: GPL-2.0
/*
 * linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"
#include "swap.h"

struct madvise_walk_private {
	struct mmu_gather *tlb;
	bool pageout;
};
/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_lock for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_FREE:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
	case MADV_COLLAPSE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}
#ifdef CONFIG_ANON_VMA_NAME
struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
	struct anon_vma_name *anon_name;
	size_t count;

	/* Add 1 for NUL terminator at the end of the anon_name->name */
	count = strlen(name) + 1;
	anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
	if (anon_name) {
		kref_init(&anon_name->kref);
		memcpy(anon_name->name, name, count);
	}

	return anon_name;
}

void anon_vma_name_free(struct kref *kref)
{
	struct anon_vma_name *anon_name =
			container_of(kref, struct anon_vma_name, kref);
	kfree(anon_name);
}

struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
	mmap_assert_locked(vma->vm_mm);

	return vma->anon_name;
}

/* mmap_lock should be write-locked */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	struct anon_vma_name *orig_name = anon_vma_name(vma);

	if (!anon_name) {
		vma->anon_name = NULL;
		anon_vma_name_put(orig_name);
		return 0;
	}

	if (anon_vma_name_eq(orig_name, anon_name))
		return 0;

	vma->anon_name = anon_vma_name_reuse(anon_name);
	anon_vma_name_put(orig_name);

	return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	if (anon_name)
		return -EINVAL;

	return 0;
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
 * Update the vm_flags on region of a vma, splitting it or merging it as
 * necessary.  Must be called with mmap_lock held for writing;
 * Caller should ensure anon_name stability by raising its refcount even when
 * anon_name belongs to a valid vma because this function might free that vma.
 */
static int madvise_update_vma(struct vm_area_struct *vma,
			      struct vm_area_struct **prev, unsigned long start,
			      unsigned long end, unsigned long new_flags,
			      struct anon_vma_name *anon_name)
{
	struct mm_struct *mm = vma->vm_mm;
	int error;
	VMA_ITERATOR(vmi, mm, start);

	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
		*prev = vma;
		return 0;
	}

	vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags,
				    anon_name);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*prev = vma;

	/* vm_flags is protected by the mmap_lock held in write mode. */
	vma_start_write(vma);
	vm_flags_reset(vma, new_flags);
	if (!vma->vm_file || vma_is_anon_shmem(vma)) {
		error = replace_anon_vma_name(vma, anon_name);
		if (error)
			return error;
	}

	return 0;
}
#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
		unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	struct swap_iocb *splug = NULL;
	pte_t *ptep = NULL;
	spinlock_t *ptl;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct folio *folio;

		if (!ptep++) {
			ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
			if (!ptep)
				break;
		}

		pte = ptep_get(ptep);
		if (!is_swap_pte(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		pte_unmap_unlock(ptep, ptl);
		ptep = NULL;

		folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					      vma, addr, &splug);
		if (folio)
			folio_put(folio);
	}

	if (ptep)
		pte_unmap_unlock(ptep, ptl);
	swap_read_unplug(splug);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry = swapin_walk_pmd_entry,
	.walk_lock = PGWALK_RDLOCK,
};

static void shmem_swapin_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end) - 1;
	struct folio *folio;
	struct swap_iocb *splug = NULL;

	rcu_read_lock();
	xas_for_each(&xas, folio, end_index) {
		unsigned long addr;
		swp_entry_t entry;

		if (!xa_is_value(folio))
			continue;
		entry = radix_to_swp_entry(folio);
		/* There might be swapin error entries in shmem mapping. */
		if (non_swap_entry(entry))
			continue;

		addr = vma->vm_start +
			((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT);
		xas_pause(&xas);
		rcu_read_unlock();

		folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
					      vma, addr, &splug);
		if (folio)
			folio_put(folio);

		rcu_read_lock();
	}
	rcu_read_unlock();
	swap_read_unplug(splug);
}
#endif /* CONFIG_SWAP */
/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		shmem_swapin_range(vma, start, end, file->f_mapping);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
	get_file(file);
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}
static inline bool can_do_file_pageout(struct vm_area_struct *vma)
{
	if (!vma->vm_file)
		return false;
	/*
	 * paging out pagecache only for non-anonymous mappings that correspond
	 * to the files the calling process could (if tried) open for writing;
	 * otherwise we'd be including shared non-exclusive mappings, which
	 * opens a side channel.
	 */
	return inode_owner_or_capable(&nop_mnt_idmap,
				      file_inode(vma->vm_file)) ||
	       file_permission(vma->vm_file, MAY_WRITE) == 0;
}

static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
					  struct folio *folio, pte_t *ptep,
					  pte_t pte, bool *any_young,
					  bool *any_dirty)
{
	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
	int max_nr = (end - addr) / PAGE_SIZE;

	return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
			       any_young, any_dirty);
}
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *start_pte, *pte, ptent;
	spinlock_t *ptl;
	struct folio *folio = NULL;
	LIST_HEAD(folio_list);
	bool pageout_anon_only_filter;
	unsigned int batch_count = 0;
	int nr;

	if (fatal_signal_pending(current))
		return -EINTR;

	pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
					!can_do_file_pageout(vma);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!is_pmd_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		folio = pmd_folio(orig_pmd);

		/* Do not interfere with other mappings of this folio */
		if (folio_likely_mapped_shared(folio))
			goto huge_unlock;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			goto huge_unlock;

		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			folio_get(folio);
			spin_unlock(ptl);
			folio_lock(folio);
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
			if (!err)
				goto regular_folio;
			return 0;
		}

		if (!pageout && pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&folio_list);
		return 0;
	}

regular_folio:
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
restart:
	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
		nr = 1;
		ptent = ptep_get(pte);

		if (++batch_count == SWAP_CLUSTER_MAX) {
			batch_count = 0;
			if (need_resched()) {
				arch_leave_lazy_mmu_mode();
				pte_unmap_unlock(start_pte, ptl);
				cond_resched();
				goto restart;
			}
		}

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		/*
		 * If we encounter a large folio, only split it if it is not
		 * fully mapped within the range we are operating on. Otherwise
		 * leave it as is so that it can be swapped out whole. If we
		 * fail to split a folio, leave it in place and advance to the
		 * next pte in the range.
		 */
		if (folio_test_large(folio)) {
			bool any_young;

			nr = madvise_folio_pte_batch(addr, end, folio, pte,
						     ptent, &any_young, NULL);
			if (any_young)
				ptent = pte_mkyoung(ptent);

			if (nr < folio_nr_pages(folio)) {
				int err;

				if (folio_likely_mapped_shared(folio))
					continue;
				if (pageout_anon_only_filter && !folio_test_anon(folio))
					continue;
				if (!folio_trylock(folio))
					continue;
				folio_get(folio);
				arch_leave_lazy_mmu_mode();
				pte_unmap_unlock(start_pte, ptl);
				start_pte = NULL;
				err = split_folio(folio);
				folio_unlock(folio);
				folio_put(folio);
				start_pte = pte =
					pte_offset_map_lock(mm, pmd, addr, &ptl);
				if (!start_pte)
					break;
				flush_tlb_batched_pending(mm);
				arch_enter_lazy_mmu_mode();
				if (!err)
					nr = 0;
				continue;
			}
		}

		/*
		 * Do not interfere with other mappings of this folio and
		 * non-LRU folio. If we have a large folio at this point, we
		 * know it is fully mapped so if its mapcount is the same as its
		 * number of pages, it must be exclusive.
		 */
		if (!folio_test_lru(folio) ||
		    folio_mapcount(folio) != folio_nr_pages(folio))
			continue;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			continue;

		if (!pageout && pte_young(ptent)) {
			clear_young_dirty_ptes(vma, addr, pte, nr,
					       CYDP_CLEAR_YOUNG);
			tlb_remove_tlb_entries(tlb, pte, nr, addr);
		}

		/*
		 * We are deactivating a folio to accelerate its reclaim.
		 * The VM can't reclaim the folio unless we clear PG_young.
		 * As a side effect, this confuses idle-page tracking,
		 * which will miss the recent reference history.
		 */
		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
	}

	if (start_pte) {
		arch_leave_lazy_mmu_mode();
		pte_unmap_unlock(start_pte, ptl);
	}
	if (pageout)
		reclaim_pages(&folio_list);
	cond_resched();

	return 0;
}
static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
	.walk_lock = PGWALK_RDLOCK,
};

static void madvise_cold_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = false,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
}

static long madvise_cold(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

static void madvise_pageout_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static long madvise_pageout(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	/*
	 * If the VMA belongs to a private file mapping, there can be private
	 * dirty pages which can be paged out even if this process is neither
	 * the owner of nor write-capable on the file. We therefore still
	 * allow private file mappings, so that dirty anon pages can be
	 * paged out.
	 */
	if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
				(vma->vm_flags & VM_MAYSHARE)))
		return 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *start_pte, *pte, ptent;
	struct folio *folio;
	int nr_swap = 0;
	unsigned long next;
	int nr, max_nr;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
		nr = 1;
		ptent = ptep_get(pte);

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte holds a swap entry, just clear the page table
		 * entry to prevent swap-in, which is more expensive than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (!non_swap_entry(entry)) {
				max_nr = (end - addr) / PAGE_SIZE;
				nr = swap_pte_batch(pte, max_nr, ptent);
				nr_swap -= nr;
				free_swap_and_cache_nr(entry, nr);
				clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
			} else if (is_hwpoison_entry(entry) ||
				   is_poisoned_swp_entry(entry)) {
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			}
			continue;
		}

		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		/*
		 * If we encounter a large folio, only split it if it is not
		 * fully mapped within the range we are operating on. Otherwise
		 * leave it as is so that it can be marked as lazyfree. If we
		 * fail to split a folio, leave it in place and advance to the
		 * next pte in the range.
		 */
		if (folio_test_large(folio)) {
			bool any_young, any_dirty;

			nr = madvise_folio_pte_batch(addr, end, folio, pte,
						     ptent, &any_young, &any_dirty);

			if (nr < folio_nr_pages(folio)) {
				int err;

				if (folio_likely_mapped_shared(folio))
					continue;
				if (!folio_trylock(folio))
					continue;
				folio_get(folio);
				arch_leave_lazy_mmu_mode();
				pte_unmap_unlock(start_pte, ptl);
				start_pte = NULL;
				err = split_folio(folio);
				folio_unlock(folio);
				folio_put(folio);
				pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				start_pte = pte;
				if (!start_pte)
					break;
				flush_tlb_batched_pending(mm);
				arch_enter_lazy_mmu_mode();
				if (!err)
					nr = 0;
				continue;
			}

			if (any_young)
				ptent = pte_mkyoung(ptent);
			if (any_dirty)
				ptent = pte_mkdirty(ptent);
		}

		if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
			if (!folio_trylock(folio))
				continue;
			/*
			 * If we have a large folio at this point, we know it is
			 * fully mapped so if its mapcount is the same as its
			 * number of pages, it must be exclusive.
			 */
			if (folio_mapcount(folio) != folio_nr_pages(folio)) {
				folio_unlock(folio);
				continue;
			}

			if (folio_test_swapcache(folio) &&
			    !folio_free_swap(folio)) {
				folio_unlock(folio);
				continue;
			}

			folio_clear_dirty(folio);
			folio_unlock(folio);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags);
			tlb_remove_tlb_entries(tlb, pte, nr, addr);
		}
		folio_mark_lazyfree(folio);
	}

	if (nr_swap)
		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	if (start_pte) {
		arch_leave_lazy_mmu_mode();
		pte_unmap_unlock(start_pte, ptl);
	}
	cond_resched();

	return 0;
}
static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry = madvise_free_pte_range,
	.walk_lock = PGWALK_RDLOCK,
};

static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb);

	return 0;
}
/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range_single call sets things up for shrink_active_list to actually
 * free these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	zap_page_range_single(vma, start, end - start, NULL);
	return 0;
}
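/*
 * Userspace view (illustrative sketch, not kernel code): how MADV_DONTNEED
 * and MADV_FREE, both dispatched through madvise_dontneed_free() below,
 * differ for a private anonymous mapping.  Names and sizes here are made
 * up for the example.
 *
 *	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	p[0] = 42;
 *
 *	madvise(p, len, MADV_DONTNEED);	// backing pages dropped right away;
 *					// the next read of p[0] returns 0
 *
 *	madvise(p, len, MADV_FREE);	// pages only become reclaimable;
 *					// contents survive until memory
 *					// pressure, and a later write
 *					// cancels the hint
 */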
static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
					    unsigned long start,
					    unsigned long *end,
					    int behavior)
{
	if (!is_vm_hugetlb_page(vma)) {
		unsigned int forbidden = VM_PFNMAP;

		if (behavior != MADV_DONTNEED_LOCKED)
			forbidden |= VM_LOCKED;

		return !(vma->vm_flags & forbidden);
	}

	if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
		return false;
	if (start & ~huge_page_mask(hstate_vma(vma)))
		return false;

	/*
	 * Madvise callers expect the length to be rounded up to PAGE_SIZE
	 * boundaries, and may be unaware that this VMA uses huge pages.
	 * Avoid unexpected data loss by rounding down the number of
	 * huge pages freed.
	 */
	*end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));

	return true;
}
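/*
 * Worked example of the rounding above (illustrative only): with 2 MiB
 * huge pages, a caller passing a huge-page-aligned start and a 3 MiB
 * length gets *end rounded down to start + 2 MiB, so only the single
 * fully covered huge page is freed and the partially covered one is
 * left intact.
 */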
static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	struct mm_struct *mm = vma->vm_mm;

	*prev = vma;
	if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
		return -EINVAL;

	if (start == end)
		return 0;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_lock has been dropped, prev is stale */

		mmap_read_lock(mm);
		vma = vma_lookup(mm, start);
		if (!vma)
			return -ENOMEM;
		/*
		 * Potential end adjustment for hugetlb vma is OK as
		 * the check below keeps end within vma.
		 */
		if (!madvise_dontneed_free_valid_vma(vma, start, &end,
						     behavior))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_lock was
			 * released the effect of the concurrent
			 * operation may not cause madvise() to
			 * have an undefined result. There may be an
			 * adjacent next vma that we'll walk
			 * next. userfaultfd_remove() will generate an
			 * UFFD_EVENT_REMOVE repetition on the
			 * end-vma->vm_end range, but the manager can
			 * handle a repetition fine.
			 */
			end = vma->vm_end;
		}
		/*
		 * If the memory region between start and end was
		 * originally backed by 4kB pages and then remapped to
		 * be backed by hugepages while mmap_lock was dropped,
		 * the adjustment for hugetlb vma above may have rounded
		 * end down to the start address.
		 */
		if (start == end)
			return 0;
		VM_WARN_ON(start > end);
	}

	if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}
static long madvise_populate(struct mm_struct *mm, unsigned long start,
		unsigned long end, int behavior)
{
	const bool write = behavior == MADV_POPULATE_WRITE;
	int locked = 1;
	long pages;

	while (start < end) {
		/* Populate (prefault) page tables readable/writable. */
		pages = faultin_page_range(mm, start, end, write, &locked);
		if (!locked) {
			mmap_read_lock(mm);
			locked = 1;
		}
		if (pages < 0) {
			switch (pages) {
			case -EINTR:
				return -EINTR;
			case -EINVAL: /* Incompatible mappings / permissions. */
				return -EINVAL;
			case -EHWPOISON:
				return -EHWPOISON;
			case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
				return -EFAULT;
			default:
				pr_warn_once("%s: unhandled return value: %ld\n",
					     __func__, pages);
				fallthrough;
			case -ENOMEM: /* No VMA or out of memory. */
				return -ENOMEM;
			}
		}
		start += pages * PAGE_SIZE;
	}
	return 0;
}
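/*
 * Userspace sketch (illustrative, not kernel code): prefaulting a new
 * mapping with MADV_POPULATE_WRITE so that later stores do not take page
 * faults.  Assumes a kernel that provides MADV_POPULATE_WRITE (5.14+);
 * error handling is abbreviated.
 *
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	if (madvise(buf, len, MADV_POPULATE_WRITE))
 *		perror("madvise(MADV_POPULATE_WRITE)");
 */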
/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;
	struct mm_struct *mm = vma->vm_mm;

	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	if (!vma_is_shared_maywrite(vma))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_rwsem.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_lock was not released by userfaultfd_remove() */
		mmap_read_unlock(mm);
	}
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	mmap_read_lock(mm);
	return error;
}
/*
 * Apply an madvise behavior to a region of a vma.  madvise_update_vma
 * will handle splitting a vm area into separate areas, each area with its own
 * behavior.
 */
static int madvise_vma_behavior(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end,
				unsigned long behavior)
{
	int error;
	struct anon_vma_name *anon_name;
	unsigned long new_flags = vma->vm_flags;

	if (unlikely(!can_modify_vma_madv(vma, behavior)))
		return -EPERM;

	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_COLD:
		return madvise_cold(vma, prev, start, end);
	case MADV_PAGEOUT:
		return madvise_pageout(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO)
			return -EINVAL;
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED)
			return -EINVAL;
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		if (vma->vm_flags & VM_DROPPABLE)
			return -EINVAL;
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) ||
		    (vma->vm_flags & VM_DROPPABLE))
			return -EINVAL;
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out;
		break;
	case MADV_COLLAPSE:
		return madvise_collapse(vma, prev, start, end);
	}

	anon_name = anon_vma_name(vma);
	anon_vma_name_get(anon_name);
	error = madvise_update_vma(vma, prev, start, end, new_flags,
				   anon_name);
	anon_vma_name_put(anon_name);

out:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	unsigned long size;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	for (; start < end; start += size) {
		unsigned long pfn;
		struct page *page;
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page.
		 */
		size = page_size(compound_head(page));

		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
				pfn, start);
			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
		} else {
			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				pfn, start);
			ret = memory_failure(pfn, MF_ACTION_REQUIRED | MF_COUNT_INCREASED | MF_SW_SIMULATED);
			if (ret == -EOPNOTSUPP)
				ret = 0;
		}

		if (ret)
			return ret;
	}

	return 0;
}
#endif
static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_FREE:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
	case MADV_COLLAPSE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		return true;

	default:
		return false;
	}
}

static bool process_madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_WILLNEED:
	case MADV_COLLAPSE:
		return true;
	default:
		return false;
	}
}
/*
 * Walk the vmas in range [start,end), and call the visit function on each one.
 * The visit function will get start and end parameters that cover the overlap
 * between the current vma and the original range.  Any unmapped regions in the
 * original range will result in this function returning -ENOMEM while still
 * calling the visit function on all of the existing vmas in the range.
 * Must be called with the mmap_lock held for reading or writing.
 */
static
int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
		      unsigned long end, unsigned long arg,
		      int (*visit)(struct vm_area_struct *vma,
				   struct vm_area_struct **prev, unsigned long start,
				   unsigned long end, unsigned long arg))
{
	struct vm_area_struct *vma;
	struct vm_area_struct *prev;
	unsigned long tmp;
	int unmapped_error = 0;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	for (;;) {
		int error;

		/* Still start < end. */
		if (!vma)
			return -ENOMEM;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				break;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = visit(vma, &prev, start, tmp, arg);
		if (error)
			return error;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		if (start >= end)
			break;
		if (prev)
			vma = find_vma(mm, prev->vm_end);
		else	/* madvise_remove dropped mmap_lock */
			vma = find_vma(mm, start);
	}

	return unmapped_error;
}
#ifdef CONFIG_ANON_VMA_NAME
static int madvise_vma_anon_name(struct vm_area_struct *vma,
				 struct vm_area_struct **prev,
				 unsigned long start, unsigned long end,
				 unsigned long anon_name)
{
	int error;

	/* Only anonymous mappings can be named */
	if (vma->vm_file && !vma_is_anon_shmem(vma))
		return -EBADF;

	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
				   (struct anon_vma_name *)anon_name);

	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
			  unsigned long len_in, struct anon_vma_name *anon_name)
{
	unsigned long end;
	unsigned long len;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
				 madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */
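/*
 * Userspace sketch (illustrative, not kernel code): madvise_set_anon_name()
 * above is reached via prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ...), which
 * labels an anonymous mapping so it shows up as "[anon:...]" in
 * /proc/<pid>/maps.  Requires a kernel built with CONFIG_ANON_VMA_NAME;
 * addr and length below are placeholders.
 *
 *	#include <sys/prctl.h>
 *
 *	prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
 *	      (unsigned long)addr, length, "my arena");
 */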
/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the appli-
 *		cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *		range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK.
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *		were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future. Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_COLLAPSE - synchronously coalesce pages into new THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *		deactivate pages in this range so that they can be reclaimed
 *		easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *		page out the pages in this range immediately.
 *  MADV_POPULATE_READ - populate (prefault) page tables readable by
 *		triggering read faults if required.
 *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
 *		triggering write faults if required.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VM_PFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 *  -EPERM  - memory is sealed.
 */
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
	unsigned long end;
	int error;
	int write;
	size_t len;
	struct blk_plug plug;

	if (!madvise_behavior_valid(behavior))
		return -EINVAL;

	if (!PAGE_ALIGNED(start))
		return -EINVAL;
	len = PAGE_ALIGN(len_in);

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_inject_error(behavior, start, start + len_in);
#endif

	write = madvise_need_mmap_write(behavior);
	if (write) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
	} else {
		mmap_read_lock(mm);
	}

	start = untagged_addr_remote(mm, start);
	end = start + len;

	blk_start_plug(&plug);
	switch (behavior) {
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
		error = madvise_populate(mm, start, end, behavior);
		break;
	default:
		error = madvise_walk_vmas(mm, start, end, behavior,
					  madvise_vma_behavior);
		break;
	}
	blk_finish_plug(&plug);

	if (write)
		mmap_write_unlock(mm);
	else
		mmap_read_unlock(mm);

	return error;
}
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	return do_madvise(current->mm, start, len_in, behavior);
}
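/*
 * Userspace sketch (illustrative, not kernel code): typical use of the
 * syscall defined above.  Error handling is abbreviated and the access
 * pattern is only an example.
 *
 *	#include <sys/mman.h>
 *
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *	madvise(buf, len, MADV_SEQUENTIAL);	// hint: sequential access
 *	// ... stream through buf ...
 *	madvise(buf, len, MADV_DONTNEED);	// done: contents may be dropped
 */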
SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
		size_t, vlen, int, behavior, unsigned int, flags)
{
	ssize_t ret;
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	struct task_struct *task;
	struct mm_struct *mm;
	size_t total_len;
	unsigned int f_flags;

	if (flags != 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
	if (ret < 0)
		goto out;

	task = pidfd_get_task(pidfd, &f_flags);
	if (IS_ERR(task)) {
		ret = PTR_ERR(task);
		goto free_iov;
	}

	if (!process_madvise_behavior_valid(behavior)) {
		ret = -EINVAL;
		goto release_task;
	}

	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
	if (IS_ERR_OR_NULL(mm)) {
		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
		goto release_task;
	}

	/*
	 * Require CAP_SYS_NICE for influencing process performance. Note that
	 * only non-destructive hints are currently supported.
	 */
	if (mm != current->mm && !capable(CAP_SYS_NICE)) {
		ret = -EPERM;
		goto release_mm;
	}

	total_len = iov_iter_count(&iter);

	while (iov_iter_count(&iter)) {
		ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
				 iter_iov_len(&iter), behavior);
		if (ret < 0)
			break;
		iov_iter_advance(&iter, iter_iov_len(&iter));
	}

	ret = (total_len - iov_iter_count(&iter)) ? : ret;

release_mm:
	mmput(mm);
release_task:
	put_task_struct(task);
free_iov:
	kfree(iov);
out:
	return ret;
}
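/*
 * Userspace sketch (illustrative, not kernel code): asking the kernel to
 * reclaim cold memory in another process via process_madvise(2).  Uses raw
 * syscall numbers since a libc wrapper may not be available; target_pid,
 * remote_addr and remote_len are placeholders.
 *
 *	#include <sys/syscall.h>
 *	#include <sys/uio.h>
 *	#include <unistd.h>
 *
 *	int pidfd = syscall(SYS_pidfd_open, target_pid, 0);
 *	struct iovec iov = {
 *		.iov_base = remote_addr,
 *		.iov_len  = remote_len,
 *	};
 *	ssize_t n = syscall(SYS_process_madvise, pidfd, &iov, 1,
 *			    MADV_COLD, 0);
 */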