vma.c

  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. * VMA-specific functions.
  4. */
  5. #include "vma_internal.h"
  6. #include "vma.h"
  7. static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
  8. {
  9. struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
  10. if (!mpol_equal(vmg->policy, vma_policy(vma)))
  11. return false;
  12. /*
  13. * VM_SOFTDIRTY should not prevent VMA merging if the flags match
  14. * apart from the dirty bit -- the caller should mark the merged
  15. * VMA as dirty. If the dirty bit were not excluded from the
  16. * comparison, we would increase pressure on the memory system by
  17. * forcing the kernel to generate new VMAs when an old one could
  18. * have been extended instead.
  19. */
  20. if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY)
  21. return false;
  22. if (vma->vm_file != vmg->file)
  23. return false;
  24. if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx))
  25. return false;
  26. if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name))
  27. return false;
  28. return true;
  29. }
  30. static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
  31. struct anon_vma *anon_vma2, struct vm_area_struct *vma)
  32. {
  33. /*
  34. * The list_is_singular() test is to avoid merging VMAs cloned from
  35. * parents; this improves scalability by reducing anon_vma lock contention.
  36. */
  37. if ((!anon_vma1 || !anon_vma2) && (!vma ||
  38. list_is_singular(&vma->anon_vma_chain)))
  39. return true;
  40. return anon_vma1 == anon_vma2;
  41. }
  42. /* Are the anon_vma's belonging to each VMA compatible with one another? */
  43. static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1,
  44. struct vm_area_struct *vma2)
  45. {
  46. return is_mergeable_anon_vma(vma1->anon_vma, vma2->anon_vma, NULL);
  47. }
  48. /*
  49. * init_multi_vma_prep() - Initializer for struct vma_prepare
  50. * @vp: The vma_prepare struct
  51. * @vma: The vma that will be altered once locked
  52. * @next: The next vma if it is to be adjusted
  53. * @remove: The first vma to be removed
  54. * @remove2: The second vma to be removed
  55. */
  56. static void init_multi_vma_prep(struct vma_prepare *vp,
  57. struct vm_area_struct *vma,
  58. struct vm_area_struct *next,
  59. struct vm_area_struct *remove,
  60. struct vm_area_struct *remove2)
  61. {
  62. memset(vp, 0, sizeof(struct vma_prepare));
  63. vp->vma = vma;
  64. vp->anon_vma = vma->anon_vma;
  65. vp->remove = remove;
  66. vp->remove2 = remove2;
  67. vp->adj_next = next;
  68. if (!vp->anon_vma && next)
  69. vp->anon_vma = next->anon_vma;
  70. vp->file = vma->vm_file;
  71. if (vp->file)
  72. vp->mapping = vma->vm_file->f_mapping;
  73. }
  74. /*
  75. * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
  76. * in front of (at a lower virtual address and file offset than) the vma.
  77. *
  78. * We cannot merge two vmas if they have differently assigned (non-NULL)
  79. * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
  80. *
  81. * We don't check here for the merged mmap wrapping around the end of pagecache
  82. * indices (16TB on ia32) because do_mmap() does not permit mmaps which
  83. * wrap, nor mmaps which cover the final page at index -1UL.
  84. *
  85. * We assume the vma may be removed as part of the merge.
  86. */
  87. static bool can_vma_merge_before(struct vma_merge_struct *vmg)
  88. {
  89. pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
  90. if (is_mergeable_vma(vmg, /* merge_next = */ true) &&
  91. is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) {
  92. if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
  93. return true;
  94. }
  95. return false;
  96. }
  97. /*
  98. * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
  99. * beyond (at a higher virtual address and file offset than) the vma.
  100. *
  101. * We cannot merge two vmas if they have differently assigned (non-NULL)
  102. * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
  103. *
  104. * We assume that vma is not removed as part of the merge.
  105. */
  106. static bool can_vma_merge_after(struct vma_merge_struct *vmg)
  107. {
  108. if (is_mergeable_vma(vmg, /* merge_next = */ false) &&
  109. is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
  110. if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff)
  111. return true;
  112. }
  113. return false;
  114. }
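/*
 * A minimal worked example of the pgoff adjacency rule checked above
 * (illustration only, with assumed values and 4KiB pages): suppose prev
 * maps file pages [pgoff 10, pgoff 14) at [0xa000, 0xe000), so
 * vma_pages(prev) == 4. A proposed range starting at 0xe000 can only
 * merge "after" prev if its file offsets continue seamlessly across the
 * virtual boundary:
 *
 *	bool adjacent = prev->vm_pgoff + vma_pages(prev) == vmg->pgoff;
 *
 * i.e. vmg->pgoff must be 10 + 4 == 14. Any other pgoff would leave a
 * hole or overlap in the file mapping, and the merge is refused.
 */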
  115. static void __vma_link_file(struct vm_area_struct *vma,
  116. struct address_space *mapping)
  117. {
  118. if (vma_is_shared_maywrite(vma))
  119. mapping_allow_writable(mapping);
  120. flush_dcache_mmap_lock(mapping);
  121. vma_interval_tree_insert(vma, &mapping->i_mmap);
  122. flush_dcache_mmap_unlock(mapping);
  123. }
  124. /*
  125. * Requires inode->i_mapping->i_mmap_rwsem
  126. */
  127. static void __remove_shared_vm_struct(struct vm_area_struct *vma,
  128. struct address_space *mapping)
  129. {
  130. if (vma_is_shared_maywrite(vma))
  131. mapping_unmap_writable(mapping);
  132. flush_dcache_mmap_lock(mapping);
  133. vma_interval_tree_remove(vma, &mapping->i_mmap);
  134. flush_dcache_mmap_unlock(mapping);
  135. }
  136. /*
  137. * vma_prepare() - Helper function for handling locking VMAs prior to altering
  138. * @vp: The initialized vma_prepare struct
  139. */
  140. static void vma_prepare(struct vma_prepare *vp)
  141. {
  142. if (vp->file) {
  143. uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
  144. if (vp->adj_next)
  145. uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
  146. vp->adj_next->vm_end);
  147. i_mmap_lock_write(vp->mapping);
  148. if (vp->insert && vp->insert->vm_file) {
  149. /*
  150. * Put into interval tree now, so instantiated pages
  151. * are visible to arm/parisc __flush_dcache_page
  152. * throughout; but we cannot insert into address
  153. * space until vma start or end is updated.
  154. */
  155. __vma_link_file(vp->insert,
  156. vp->insert->vm_file->f_mapping);
  157. }
  158. }
  159. if (vp->anon_vma) {
  160. anon_vma_lock_write(vp->anon_vma);
  161. anon_vma_interval_tree_pre_update_vma(vp->vma);
  162. if (vp->adj_next)
  163. anon_vma_interval_tree_pre_update_vma(vp->adj_next);
  164. }
  165. if (vp->file) {
  166. flush_dcache_mmap_lock(vp->mapping);
  167. vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
  168. if (vp->adj_next)
  169. vma_interval_tree_remove(vp->adj_next,
  170. &vp->mapping->i_mmap);
  171. }
  172. }
  173. /*
  174. * vma_complete() - Helper function for handling the unlocking after altering VMAs,
  175. * or for inserting a VMA.
  176. *
  177. * @vp: The vma_prepare struct
  178. * @vmi: The vma iterator
  179. * @mm: The mm_struct
  180. */
  181. static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
  182. struct mm_struct *mm)
  183. {
  184. if (vp->file) {
  185. if (vp->adj_next)
  186. vma_interval_tree_insert(vp->adj_next,
  187. &vp->mapping->i_mmap);
  188. vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
  189. flush_dcache_mmap_unlock(vp->mapping);
  190. }
  191. if (vp->remove && vp->file) {
  192. __remove_shared_vm_struct(vp->remove, vp->mapping);
  193. if (vp->remove2)
  194. __remove_shared_vm_struct(vp->remove2, vp->mapping);
  195. } else if (vp->insert) {
  196. /*
  197. * split_vma has split insert from vma, and needs
  198. * us to insert it before dropping the locks
  199. * (it may either follow vma or precede it).
  200. */
  201. vma_iter_store(vmi, vp->insert);
  202. mm->map_count++;
  203. }
  204. if (vp->anon_vma) {
  205. anon_vma_interval_tree_post_update_vma(vp->vma);
  206. if (vp->adj_next)
  207. anon_vma_interval_tree_post_update_vma(vp->adj_next);
  208. anon_vma_unlock_write(vp->anon_vma);
  209. }
  210. if (vp->file) {
  211. i_mmap_unlock_write(vp->mapping);
  212. uprobe_mmap(vp->vma);
  213. if (vp->adj_next)
  214. uprobe_mmap(vp->adj_next);
  215. }
  216. if (vp->remove) {
  217. again:
  218. vma_mark_detached(vp->remove, true);
  219. if (vp->file) {
  220. uprobe_munmap(vp->remove, vp->remove->vm_start,
  221. vp->remove->vm_end);
  222. fput(vp->file);
  223. }
  224. if (vp->remove->anon_vma)
  225. anon_vma_merge(vp->vma, vp->remove);
  226. mm->map_count--;
  227. mpol_put(vma_policy(vp->remove));
  228. if (!vp->remove2)
  229. WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
  230. vm_area_free(vp->remove);
  231. /*
  232. * In mprotect's case 6 (see comments on vma_merge),
  233. * we are removing both mid and next vmas
  234. */
  235. if (vp->remove2) {
  236. vp->remove = vp->remove2;
  237. vp->remove2 = NULL;
  238. goto again;
  239. }
  240. }
  241. if (vp->insert && vp->file)
  242. uprobe_mmap(vp->insert);
  243. }
  244. /*
  245. * init_vma_prep() - Initializer wrapper for vma_prepare struct
  246. * @vp: The vma_prepare struct
  247. * @vma: The vma that will be altered once locked
  248. */
  249. static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma)
  250. {
  251. init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
  252. }
  253. /*
  254. * Can the proposed VMA be merged with the left (previous) VMA taking into
  255. * account the start position of the proposed range.
  256. */
  257. static bool can_vma_merge_left(struct vma_merge_struct *vmg)
  258. {
  259. return vmg->prev && vmg->prev->vm_end == vmg->start &&
  260. can_vma_merge_after(vmg);
  261. }
  262. /*
  263. * Can the proposed VMA be merged with the right (next) VMA taking into
  264. * account the end position of the proposed range.
  265. *
  266. * In addition, if we can merge with the left VMA, ensure that left and right
  267. * anon_vma's are also compatible.
  268. */
  269. static bool can_vma_merge_right(struct vma_merge_struct *vmg,
  270. bool can_merge_left)
  271. {
  272. if (!vmg->next || vmg->end != vmg->next->vm_start ||
  273. !can_vma_merge_before(vmg))
  274. return false;
  275. if (!can_merge_left)
  276. return true;
  277. /*
  278. * If we can merge with prev (left) and next (right), indicating that
  279. * each VMA's anon_vma is compatible with the proposed anon_vma, this
  280. * does not mean prev and next are compatible with EACH OTHER.
  281. *
  282. * We therefore check this in addition to mergeability to either side.
  283. */
  284. return are_anon_vmas_compatible(vmg->prev, vmg->next);
  285. }
  286. /*
  287. * Close a vm structure and free it.
  288. */
  289. void remove_vma(struct vm_area_struct *vma, bool unreachable)
  290. {
  291. might_sleep();
  292. vma_close(vma);
  293. if (vma->vm_file)
  294. fput(vma->vm_file);
  295. mpol_put(vma_policy(vma));
  296. if (unreachable)
  297. __vm_area_free(vma);
  298. else
  299. vm_area_free(vma);
  300. }
  301. /*
  302. * Get rid of page table information in the indicated region.
  303. *
  304. * Called with the mm semaphore held.
  305. */
  306. void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
  307. struct vm_area_struct *prev, struct vm_area_struct *next)
  308. {
  309. struct mm_struct *mm = vma->vm_mm;
  310. struct mmu_gather tlb;
  311. lru_add_drain();
  312. tlb_gather_mmu(&tlb, mm);
  313. update_hiwater_rss(mm);
  314. unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end,
  315. /* mm_wr_locked = */ true);
  316. mas_set(mas, vma->vm_end);
  317. free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
  318. next ? next->vm_start : USER_PGTABLES_CEILING,
  319. /* mm_wr_locked = */ true);
  320. tlb_finish_mmu(&tlb);
  321. }
  322. /*
  323. * __split_vma() bypasses sysctl_max_map_count checking. We use this where it
  324. * has already been checked or doesn't make sense to fail.
  325. * VMA Iterator will point to the original VMA.
  326. */
  327. static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
  328. unsigned long addr, int new_below)
  329. {
  330. struct vma_prepare vp;
  331. struct vm_area_struct *new;
  332. int err;
  333. WARN_ON(vma->vm_start >= addr);
  334. WARN_ON(vma->vm_end <= addr);
  335. if (vma->vm_ops && vma->vm_ops->may_split) {
  336. err = vma->vm_ops->may_split(vma, addr);
  337. if (err)
  338. return err;
  339. }
  340. new = vm_area_dup(vma);
  341. if (!new)
  342. return -ENOMEM;
  343. if (new_below) {
  344. new->vm_end = addr;
  345. } else {
  346. new->vm_start = addr;
  347. new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
  348. }
  349. err = -ENOMEM;
  350. vma_iter_config(vmi, new->vm_start, new->vm_end);
  351. if (vma_iter_prealloc(vmi, new))
  352. goto out_free_vma;
  353. err = vma_dup_policy(vma, new);
  354. if (err)
  355. goto out_free_vmi;
  356. err = anon_vma_clone(new, vma);
  357. if (err)
  358. goto out_free_mpol;
  359. if (new->vm_file)
  360. get_file(new->vm_file);
  361. if (new->vm_ops && new->vm_ops->open)
  362. new->vm_ops->open(new);
  363. vma_start_write(vma);
  364. vma_start_write(new);
  365. init_vma_prep(&vp, vma);
  366. vp.insert = new;
  367. vma_prepare(&vp);
  368. /*
  369. * Get rid of huge pages and shared page tables straddling the split
  370. * boundary.
  371. */
  372. vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
  373. if (is_vm_hugetlb_page(vma))
  374. hugetlb_split(vma, addr);
  375. if (new_below) {
  376. vma->vm_start = addr;
  377. vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT;
  378. } else {
  379. vma->vm_end = addr;
  380. }
  381. /* vma_complete stores the new vma */
  382. vma_complete(&vp, vmi, vma->vm_mm);
  383. validate_mm(vma->vm_mm);
  384. /* Success. */
  385. if (new_below)
  386. vma_next(vmi);
  387. else
  388. vma_prev(vmi);
  389. return 0;
  390. out_free_mpol:
  391. mpol_put(vma_policy(new));
  392. out_free_vmi:
  393. vma_iter_free(vmi);
  394. out_free_vma:
  395. vm_area_free(new);
  396. return err;
  397. }
  398. /*
  399. * Split a vma into two pieces at address 'addr', a new vma is allocated
  400. * either for the first part or the tail.
  401. */
  402. static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
  403. unsigned long addr, int new_below)
  404. {
  405. if (vma->vm_mm->map_count >= sysctl_max_map_count)
  406. return -ENOMEM;
  407. return __split_vma(vmi, vma, addr, new_below);
  408. }
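/*
 * Sketch of the new_below semantics of the split helpers above
 * (hypothetical values, illustration only): splitting a VMA spanning
 * [0x1000, 0x5000) at addr == 0x3000 with new_below == 1 allocates the
 * new VMA for the lower half and keeps the original as the upper half:
 *
 *	new:      [0x1000, 0x3000)  keeps the original pgoff
 *	original: [0x3000, 0x5000)  pgoff advanced by (0x3000 - 0x1000) >> PAGE_SHIFT
 *
 * With new_below == 0 the roles are reversed: the original keeps
 * [0x1000, 0x3000) and the new VMA covers [0x3000, 0x5000). In both
 * cases the VMA iterator is left pointing at the original VMA.
 */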
  409. /*
  410. * vma has some anon_vma assigned, and is already inserted on that
  411. * anon_vma's interval trees.
  412. *
  413. * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
  414. * vma must be removed from the anon_vma's interval trees using
  415. * anon_vma_interval_tree_pre_update_vma().
  416. *
  417. * After the update, the vma will be reinserted using
  418. * anon_vma_interval_tree_post_update_vma().
  419. *
  420. * The entire update must be protected by exclusive mmap_lock and by
  421. * the root anon_vma's mutex.
  422. */
  423. void
  424. anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
  425. {
  426. struct anon_vma_chain *avc;
  427. list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
  428. anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
  429. }
  430. void
  431. anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
  432. {
  433. struct anon_vma_chain *avc;
  434. list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
  435. anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
  436. }
  437. /*
  438. * dup_anon_vma() - Helper function to duplicate anon_vma
  439. * @dst: The destination VMA
  440. * @src: The source VMA
  441. * @dup: Pointer to the destination VMA when successful.
  442. *
  443. * Returns: 0 on success.
  444. */
  445. static int dup_anon_vma(struct vm_area_struct *dst,
  446. struct vm_area_struct *src, struct vm_area_struct **dup)
  447. {
  448. /*
  449. * Easily overlooked: when mprotect shifts the boundary, make sure the
  450. * expanding vma has an anon_vma set if the shrinking vma had one, to
  451. * cover any anon pages imported.
  452. */
  453. if (src->anon_vma && !dst->anon_vma) {
  454. int ret;
  455. vma_assert_write_locked(dst);
  456. dst->anon_vma = src->anon_vma;
  457. ret = anon_vma_clone(dst, src);
  458. if (ret)
  459. return ret;
  460. *dup = dst;
  461. }
  462. return 0;
  463. }
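/*
 * Sketch of the scenario the comment above guards against (illustration
 * only, assumed addresses): mprotect() changes the protection of the first
 * pages of a VMA that has already faulted in anonymous memory, and those
 * pages end up merged into the (previously anon_vma-less) prev VMA:
 *
 *	before:  prev [0x1000, 0x3000)  anon_vma == NULL
 *	         vma  [0x3000, 0x6000)  anon_vma == A, pages faulted in
 *	after:   prev [0x1000, 0x4000)  must now carry anon_vma A
 *	         vma  [0x4000, 0x6000)  anon_vma == A
 *
 * dup_anon_vma(prev, vma, &dup) copies A into prev and clones the
 * anon_vma_chain, so rmap can still find the pages that moved across the
 * boundary.
 */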
  464. #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
  465. void validate_mm(struct mm_struct *mm)
  466. {
  467. int bug = 0;
  468. int i = 0;
  469. struct vm_area_struct *vma;
  470. VMA_ITERATOR(vmi, mm, 0);
  471. mt_validate(&mm->mm_mt);
  472. for_each_vma(vmi, vma) {
  473. #ifdef CONFIG_DEBUG_VM_RB
  474. struct anon_vma *anon_vma = vma->anon_vma;
  475. struct anon_vma_chain *avc;
  476. #endif
  477. unsigned long vmi_start, vmi_end;
  478. bool warn = 0;
  479. vmi_start = vma_iter_addr(&vmi);
  480. vmi_end = vma_iter_end(&vmi);
  481. if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm))
  482. warn = 1;
  483. if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm))
  484. warn = 1;
  485. if (warn) {
  486. pr_emerg("issue in %s\n", current->comm);
  487. dump_stack();
  488. dump_vma(vma);
  489. pr_emerg("tree range: %px start %lx end %lx\n", vma,
  490. vmi_start, vmi_end - 1);
  491. vma_iter_dump_tree(&vmi);
  492. }
  493. #ifdef CONFIG_DEBUG_VM_RB
  494. if (anon_vma) {
  495. anon_vma_lock_read(anon_vma);
  496. list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
  497. anon_vma_interval_tree_verify(avc);
  498. anon_vma_unlock_read(anon_vma);
  499. }
  500. #endif
  501. i++;
  502. }
  503. if (i != mm->map_count) {
  504. pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
  505. bug = 1;
  506. }
  507. VM_BUG_ON_MM(bug, mm);
  508. }
  509. #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
  510. /* Actually perform the VMA merge operation. */
  511. static int commit_merge(struct vma_merge_struct *vmg,
  512. struct vm_area_struct *adjust,
  513. struct vm_area_struct *remove,
  514. struct vm_area_struct *remove2,
  515. long adj_start,
  516. bool expanded)
  517. {
  518. struct vma_prepare vp;
  519. init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2);
  520. VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
  521. vp.anon_vma != adjust->anon_vma);
  522. if (expanded) {
  523. /* Note: vma iterator must be pointing to 'start'. */
  524. vma_iter_config(vmg->vmi, vmg->start, vmg->end);
  525. } else {
  526. vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
  527. adjust->vm_end);
  528. }
  529. if (vma_iter_prealloc(vmg->vmi, vmg->vma))
  530. return -ENOMEM;
  531. vma_prepare(&vp);
  532. vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, adj_start);
  533. vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff);
  534. if (expanded)
  535. vma_iter_store(vmg->vmi, vmg->vma);
  536. if (adj_start) {
  537. adjust->vm_start += adj_start;
  538. adjust->vm_pgoff += PHYS_PFN(adj_start);
  539. if (adj_start < 0) {
  540. WARN_ON(expanded);
  541. vma_iter_store(vmg->vmi, adjust);
  542. }
  543. }
  544. vma_complete(&vp, vmg->vmi, vmg->vma->vm_mm);
  545. return 0;
  546. }
  547. /* We can only remove VMAs when merging if they do not have a close hook. */
  548. static bool can_merge_remove_vma(struct vm_area_struct *vma)
  549. {
  550. return !vma->vm_ops || !vma->vm_ops->close;
  551. }
  552. /*
  553. * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its
  554. * attributes modified.
  555. *
  556. * @vmg: Describes the modifications being made to a VMA and associated
  557. * metadata.
  558. *
  559. * When the attributes of a range within a VMA change, then it might be possible
  560. * for immediately adjacent VMAs to be merged into that VMA due to having
  561. * identical properties.
  562. *
  563. * This function checks for the existence of any such mergeable VMAs and updates
  564. * the maple tree describing the @vmg->vma->vm_mm address space to account for
  565. * this, as well as any VMAs shrunk/expanded/deleted as a result of this merge.
  566. *
  567. * As part of this operation, if a merge occurs, the @vmg object will have its
  568. * vma, start, end, and pgoff fields modified to execute the merge. Subsequent
  569. * calls to this function should reset these fields.
  570. *
  571. * Returns: The merged VMA if merge succeeds, or NULL otherwise.
  572. *
  573. * ASSUMPTIONS:
  574. * - The caller must assign the VMA to be modified to @vmg->vma.
  575. * - The caller must have set @vmg->prev to the previous VMA, if there is one.
  576. * - The caller must not set @vmg->next, as we determine this.
  577. * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
  578. * - vmi must be positioned within [@vmg->vma->vm_start, @vmg->vma->vm_end).
  579. */
  580. static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *vmg)
  581. {
  582. struct vm_area_struct *vma = vmg->vma;
  583. struct vm_area_struct *prev = vmg->prev;
  584. struct vm_area_struct *next, *res;
  585. struct vm_area_struct *anon_dup = NULL;
  586. struct vm_area_struct *adjust = NULL;
  587. unsigned long start = vmg->start;
  588. unsigned long end = vmg->end;
  589. bool left_side = vma && start == vma->vm_start;
  590. bool right_side = vma && end == vma->vm_end;
  591. int err = 0;
  592. long adj_start = 0;
  593. bool merge_will_delete_vma, merge_will_delete_next;
  594. bool merge_left, merge_right, merge_both;
  595. bool expanded;
  596. mmap_assert_write_locked(vmg->mm);
  597. VM_WARN_ON(!vma); /* We are modifying a VMA, so caller must specify. */
  598. VM_WARN_ON(vmg->next); /* We set this. */
  599. VM_WARN_ON(prev && start <= prev->vm_start);
  600. VM_WARN_ON(start >= end);
  601. /*
  602. * If vma == prev, the range may begin partway into the VMA. Otherwise it
  603. * must begin at the VMA's start; it must never extend beyond the VMA's end.
  604. */
  605. VM_WARN_ON(vma && ((vma != prev && vmg->start != vma->vm_start) ||
  606. vmg->end > vma->vm_end));
  607. /* The vmi must be positioned within vmg->vma. */
  608. VM_WARN_ON(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start &&
  609. vma_iter_addr(vmg->vmi) < vma->vm_end));
  610. vmg->state = VMA_MERGE_NOMERGE;
  611. /*
  612. * If this is a special mapping, or if the range being modified touches
  613. * neither the leftmost nor the rightmost edge of the VMA, then we have no
  614. * chance of merging and should abort.
  615. */
  616. if (vmg->flags & VM_SPECIAL || (!left_side && !right_side))
  617. return NULL;
  618. if (left_side)
  619. merge_left = can_vma_merge_left(vmg);
  620. else
  621. merge_left = false;
  622. if (right_side) {
  623. next = vmg->next = vma_iter_next_range(vmg->vmi);
  624. vma_iter_prev_range(vmg->vmi);
  625. merge_right = can_vma_merge_right(vmg, merge_left);
  626. } else {
  627. merge_right = false;
  628. next = NULL;
  629. }
  630. if (merge_left) /* If merging prev, position iterator there. */
  631. vma_prev(vmg->vmi);
  632. else if (!merge_right) /* If we have nothing to merge, abort. */
  633. return NULL;
  634. merge_both = merge_left && merge_right;
  635. /* If we span the entire VMA, a merge implies it will be deleted. */
  636. merge_will_delete_vma = left_side && right_side;
  637. /*
  638. * If we need to remove vma in its entirety but are unable to do so,
  639. * we have no sensible recourse but to abort the merge.
  640. */
  641. if (merge_will_delete_vma && !can_merge_remove_vma(vma))
  642. return NULL;
  643. /*
  644. * If we merge both VMAs, then next is also deleted. This implies
  645. * merge_will_delete_vma also.
  646. */
  647. merge_will_delete_next = merge_both;
  648. /*
  649. * If we cannot delete next, then we can reduce the operation to merging
  650. * prev and vma (thereby deleting vma).
  651. */
  652. if (merge_will_delete_next && !can_merge_remove_vma(next)) {
  653. merge_will_delete_next = false;
  654. merge_right = false;
  655. merge_both = false;
  656. }
  657. /* No matter what happens, we will be adjusting vma. */
  658. vma_start_write(vma);
  659. if (merge_left)
  660. vma_start_write(prev);
  661. if (merge_right)
  662. vma_start_write(next);
  663. if (merge_both) {
  664. /*
  665. * |<----->|
  666. * |-------*********-------|
  667. * prev vma next
  668. * extend delete delete
  669. */
  670. vmg->vma = prev;
  671. vmg->start = prev->vm_start;
  672. vmg->end = next->vm_end;
  673. vmg->pgoff = prev->vm_pgoff;
  674. /*
  675. * We already ensured anon_vma compatibility above, so now it's
  676. * simply a case of, if prev has no anon_vma object, which of
  677. * next or vma contains the anon_vma we must duplicate.
  678. */
  679. err = dup_anon_vma(prev, next->anon_vma ? next : vma, &anon_dup);
  680. } else if (merge_left) {
  681. /*
  682. * |<----->| OR
  683. * |<--------->|
  684. * |-------*************
  685. * prev vma
  686. * extend shrink/delete
  687. */
  688. vmg->vma = prev;
  689. vmg->start = prev->vm_start;
  690. vmg->pgoff = prev->vm_pgoff;
  691. if (!merge_will_delete_vma) {
  692. adjust = vma;
  693. adj_start = vmg->end - vma->vm_start;
  694. }
  695. err = dup_anon_vma(prev, vma, &anon_dup);
  696. } else { /* merge_right */
  697. /*
  698. * |<----->| OR
  699. * |<--------->|
  700. * *************-------|
  701. * vma next
  702. * shrink/delete extend
  703. */
  704. pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
  705. VM_WARN_ON(!merge_right);
  706. /* If we are offset into a VMA, then prev must be vma. */
  707. VM_WARN_ON(vmg->start > vma->vm_start && prev && vma != prev);
  708. if (merge_will_delete_vma) {
  709. vmg->vma = next;
  710. vmg->end = next->vm_end;
  711. vmg->pgoff = next->vm_pgoff - pglen;
  712. } else {
  713. /*
  714. * We shrink vma and expand next.
  715. *
  716. * IMPORTANT: This is the ONLY case where the final
  717. * merged VMA is NOT vmg->vma, but rather vmg->next.
  718. */
  719. vmg->start = vma->vm_start;
  720. vmg->end = start;
  721. vmg->pgoff = vma->vm_pgoff;
  722. adjust = next;
  723. adj_start = -(vma->vm_end - start);
  724. }
  725. err = dup_anon_vma(next, vma, &anon_dup);
  726. }
  727. /*
  728. * In nearly all cases, we expand vmg->vma. There is one exception -
  729. * merge_right where we partially span the VMA. In this case we shrink
  730. * the end of vmg->vma and adjust the start of vmg->next accordingly.
  731. */
  732. expanded = !merge_right || merge_will_delete_vma;
  733. if (err || commit_merge(vmg, adjust,
  734. merge_will_delete_vma ? vma : NULL,
  735. merge_will_delete_next ? next : NULL,
  736. adj_start, expanded))
  737. goto abort;
  738. res = merge_left ? prev : next;
  739. khugepaged_enter_vma(res, vmg->flags);
  740. vmg->state = VMA_MERGE_SUCCESS;
  741. return res;
  742. abort:
  743. vma_iter_set(vmg->vmi, start);
  744. vma_iter_load(vmg->vmi);
  745. if (anon_dup)
  746. unlink_anon_vmas(anon_dup);
  747. /*
  748. * This means we have failed to clone anon_vma's correctly, but no
  749. * actual changes to VMAs have occurred, so no harm no foul - if the
  750. * user doesn't want this reported and instead just wants to give up on
  751. * the merge, allow it.
  752. */
  753. if (!vmg->give_up_on_oom)
  754. vmg->state = VMA_MERGE_ERROR_NOMEM;
  755. return NULL;
  756. }
  757. /*
  758. * vma_merge_new_range - Attempt to merge a new VMA into address space
  759. *
  760. * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
  761. * (exclusive), which we try to merge with any adjacent VMAs if possible.
  762. *
  763. * We are about to add a VMA to the address space starting at @vmg->start and
  764. * ending at @vmg->end. There are three different possible scenarios:
  765. *
  766. * 1. There is a VMA with identical properties immediately adjacent to the
  767. * proposed new VMA [@vmg->start, @vmg->end) either before or after it -
  768. * EXPAND that VMA:
  769. *
  770. * Proposed: |-----| or |-----|
  771. * Existing: |----| |----|
  772. *
  773. * 2. There are VMAs with identical properties immediately adjacent to the
  774. * proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
  775. * EXPAND the former and REMOVE the latter:
  776. *
  777. * Proposed: |-----|
  778. * Existing: |----| |----|
  779. *
  780. * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
  781. * VMAs do not have identical attributes - NO MERGE POSSIBLE.
  782. *
  783. * In instances where we can merge, this function returns the expanded VMA which
  784. * will have its range adjusted accordingly and the underlying maple tree also
  785. * adjusted.
  786. *
  787. * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
  788. * to the VMA we expanded.
  789. *
  790. * This function adjusts @vmg to provide @vmg->next if not already specified,
  791. * and adjusts [@vmg->start, @vmg->end) to span the expanded range.
  792. *
  793. * ASSUMPTIONS:
  794. * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
  795. * - The caller must have determined that [@vmg->start, @vmg->end) is empty,
  796. *   other than VMAs that will be unmapped should the operation succeed.
  797. * - The caller must have specified the previous vma in @vmg->prev.
  798. * - The caller must have specified the next vma in @vmg->next.
  799. * - The caller must have positioned the vmi at or before the gap.
  800. */
  801. struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
  802. {
  803. struct vm_area_struct *prev = vmg->prev;
  804. struct vm_area_struct *next = vmg->next;
  805. unsigned long start = vmg->start;
  806. unsigned long end = vmg->end;
  807. pgoff_t pgoff = vmg->pgoff;
  808. pgoff_t pglen = PHYS_PFN(end - start);
  809. bool can_merge_left, can_merge_right;
  810. bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND;
  811. mmap_assert_write_locked(vmg->mm);
  812. VM_WARN_ON(vmg->vma);
  813. /* vmi must point at or before the gap. */
  814. VM_WARN_ON(vma_iter_addr(vmg->vmi) > end);
  815. vmg->state = VMA_MERGE_NOMERGE;
  816. /* Special VMAs are unmergeable, also if no prev/next. */
  817. if ((vmg->flags & VM_SPECIAL) || (!prev && !next))
  818. return NULL;
  819. can_merge_left = can_vma_merge_left(vmg);
  820. can_merge_right = !just_expand && can_vma_merge_right(vmg, can_merge_left);
  821. /* If we can merge with the next VMA, adjust vmg accordingly. */
  822. if (can_merge_right) {
  823. vmg->end = next->vm_end;
  824. vmg->vma = next;
  825. vmg->pgoff = next->vm_pgoff - pglen;
  826. }
  827. /* If we can merge with the previous VMA, adjust vmg accordingly. */
  828. if (can_merge_left) {
  829. vmg->start = prev->vm_start;
  830. vmg->vma = prev;
  831. vmg->pgoff = prev->vm_pgoff;
  832. /*
  833. * If this merge would result in removal of the next VMA but we
  834. * are not permitted to do so, reduce the operation to merging
  835. * prev and vma.
  836. */
  837. if (can_merge_right && !can_merge_remove_vma(next))
  838. vmg->end = end;
  839. /* In expand-only case we are already positioned at prev. */
  840. if (!just_expand) {
  841. /* Equivalent to going to the previous range. */
  842. vma_prev(vmg->vmi);
  843. }
  844. }
  845. /*
  846. * Now try to expand adjacent VMA(s). This takes care of removing the
  847. * following VMA if we have VMAs on both sides.
  848. */
  849. if (vmg->vma && !vma_expand(vmg)) {
  850. khugepaged_enter_vma(vmg->vma, vmg->flags);
  851. vmg->state = VMA_MERGE_SUCCESS;
  852. return vmg->vma;
  853. }
  854. /* If expansion failed, reset state. Allows us to retry merge later. */
  855. if (!just_expand) {
  856. vmg->vma = NULL;
  857. vmg->start = start;
  858. vmg->end = end;
  859. vmg->pgoff = pgoff;
  860. if (vmg->vma == prev)
  861. vma_iter_set(vmg->vmi, start);
  862. }
  863. return NULL;
  864. }
  865. /*
  866. * vma_expand - Expand an existing VMA
  867. *
  868. * @vmg: Describes a VMA expansion operation.
  869. *
  870. * Expand @vma to vmg->start and vmg->end. Can expand off the start and end.
  871. * Will expand over vmg->next if it's different from vmg->vma and vmg->end ==
  872. * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with
  873. * vmg->next needs to be handled by the caller.
  874. *
  875. * Returns: 0 on success.
  876. *
  877. * ASSUMPTIONS:
  878. * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock.
  879. * - The caller must have set @vmg->vma and @vmg->next.
  880. */
  881. int vma_expand(struct vma_merge_struct *vmg)
  882. {
  883. struct vm_area_struct *anon_dup = NULL;
  884. bool remove_next = false;
  885. struct vm_area_struct *vma = vmg->vma;
  886. struct vm_area_struct *next = vmg->next;
  887. mmap_assert_write_locked(vmg->mm);
  888. vma_start_write(vma);
  889. if (next && (vma != next) && (vmg->end == next->vm_end)) {
  890. int ret;
  891. remove_next = true;
  892. /* This should already have been checked by this point. */
  893. VM_WARN_ON(!can_merge_remove_vma(next));
  894. vma_start_write(next);
  895. ret = dup_anon_vma(vma, next, &anon_dup);
  896. if (ret)
  897. return ret;
  898. }
  899. /* Not merging but overwriting any part of next is not handled. */
  900. VM_WARN_ON(next && !remove_next &&
  901. next != vma && vmg->end > next->vm_start);
  902. /* Only handles expanding */
  903. VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end);
  904. if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true))
  905. goto nomem;
  906. return 0;
  907. nomem:
  908. if (anon_dup)
  909. unlink_anon_vmas(anon_dup);
  910. /*
  911. * If the user requests that we just give up on OOM, we are safe to do so
  912. * here, as commit_merge() provides this contract to us. Nothing has been
  913. * changed - no harm no foul, just don't report it.
  914. */
  915. if (!vmg->give_up_on_oom)
  916. vmg->state = VMA_MERGE_ERROR_NOMEM;
  917. return -ENOMEM;
  918. }
  919. /*
  920. * vma_shrink() - Reduce an existing VMA's memory area
  921. * @vmi: The vma iterator
  922. * @vma: The VMA to modify
  923. * @start: The new start
  924. * @end: The new end
  925. *
  926. * Returns: 0 on success, -ENOMEM otherwise
  927. */
  928. int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
  929. unsigned long start, unsigned long end, pgoff_t pgoff)
  930. {
  931. struct vma_prepare vp;
  932. WARN_ON((vma->vm_start != start) && (vma->vm_end != end));
  933. if (vma->vm_start < start)
  934. vma_iter_config(vmi, vma->vm_start, start);
  935. else
  936. vma_iter_config(vmi, end, vma->vm_end);
  937. if (vma_iter_prealloc(vmi, NULL))
  938. return -ENOMEM;
  939. vma_start_write(vma);
  940. init_vma_prep(&vp, vma);
  941. vma_prepare(&vp);
  942. vma_adjust_trans_huge(vma, start, end, 0);
  943. vma_iter_clear(vmi);
  944. vma_set_range(vma, start, end, pgoff);
  945. vma_complete(&vp, vmi, vma->vm_mm);
  946. validate_mm(vma->vm_mm);
  947. return 0;
  948. }
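/*
 * A minimal usage sketch for vma_shrink() (hypothetical caller and
 * addresses, illustration only): to shrink a VMA currently spanning
 * [0x1000, 0x5000) so that it keeps only [0x1000, 0x3000), a caller
 * holding the mmap write lock would do roughly:
 *
 *	VMA_ITERATOR(vmi, mm, 0x1000);
 *	struct vm_area_struct *vma = vma_find(&vmi, 0x5000);
 *
 *	if (vma)
 *		vma_shrink(&vmi, vma, 0x1000, 0x3000, vma->vm_pgoff);
 *
 * Because the start is unchanged, pgoff stays the same; the iterator is
 * configured over the trimmed tail [0x3000, 0x5000) and that range is
 * cleared from the maple tree.
 */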
  949. static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
  950. struct ma_state *mas_detach, bool mm_wr_locked)
  951. {
  952. struct mmu_gather tlb;
  953. if (!vms->clear_ptes) /* Nothing to do */
  954. return;
  955. /*
  956. * We can free page tables without write-locking mmap_lock because VMAs
  957. * were isolated before we downgraded mmap_lock.
  958. */
  959. mas_set(mas_detach, 1);
  960. lru_add_drain();
  961. tlb_gather_mmu(&tlb, vms->vma->vm_mm);
  962. update_hiwater_rss(vms->vma->vm_mm);
  963. unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
  964. vms->vma_count, mm_wr_locked);
  965. mas_set(mas_detach, 1);
  966. /* start and end may be different if there is no prev or next vma. */
  967. free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
  968. vms->unmap_end, mm_wr_locked);
  969. tlb_finish_mmu(&tlb);
  970. vms->clear_ptes = false;
  971. }
  972. void vms_clean_up_area(struct vma_munmap_struct *vms,
  973. struct ma_state *mas_detach)
  974. {
  975. struct vm_area_struct *vma;
  976. if (!vms->nr_pages)
  977. return;
  978. vms_clear_ptes(vms, mas_detach, true);
  979. mas_set(mas_detach, 0);
  980. mas_for_each(mas_detach, vma, ULONG_MAX)
  981. vma_close(vma);
  982. }
  983. /*
  984. * vms_complete_munmap_vmas() - Finish the munmap() operation
  985. * @vms: The vma munmap struct
  986. * @mas_detach: The maple state of the detached vmas
  987. *
  988. * This updates the mm_struct, unmaps the region, frees the resources
  989. * used for the munmap(), and may downgrade the lock if requested. It covers
  990. * everything that needs to be done once the vma maple tree has been updated.
  991. */
  992. void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
  993. struct ma_state *mas_detach)
  994. {
  995. struct vm_area_struct *vma;
  996. struct mm_struct *mm;
  997. mm = current->mm;
  998. mm->map_count -= vms->vma_count;
  999. mm->locked_vm -= vms->locked_vm;
  1000. if (vms->unlock)
  1001. mmap_write_downgrade(mm);
  1002. if (!vms->nr_pages)
  1003. return;
  1004. vms_clear_ptes(vms, mas_detach, !vms->unlock);
  1005. /* Update high watermark before we lower total_vm */
  1006. update_hiwater_vm(mm);
  1007. /* Stat accounting */
  1008. WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm) - vms->nr_pages);
  1009. /* Paranoid bookkeeping */
  1010. VM_WARN_ON(vms->exec_vm > mm->exec_vm);
  1011. VM_WARN_ON(vms->stack_vm > mm->stack_vm);
  1012. VM_WARN_ON(vms->data_vm > mm->data_vm);
  1013. mm->exec_vm -= vms->exec_vm;
  1014. mm->stack_vm -= vms->stack_vm;
  1015. mm->data_vm -= vms->data_vm;
  1016. /* Remove and clean up vmas */
  1017. mas_set(mas_detach, 0);
  1018. mas_for_each(mas_detach, vma, ULONG_MAX)
  1019. remove_vma(vma, /* unreachable = */ false);
  1020. vm_unacct_memory(vms->nr_accounted);
  1021. validate_mm(mm);
  1022. if (vms->unlock)
  1023. mmap_read_unlock(mm);
  1024. __mt_destroy(mas_detach->tree);
  1025. }
  1026. /*
  1027. * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree
  1028. * for removal at a later date. Handles splitting the first and last VMAs
  1029. * if necessary, and marking them as isolated.
  1030. *
  1031. * @vms: The vma munmap struct
  1032. * @mas_detach: The maple state tracking the detached tree
  1033. *
  1034. * Return: 0 on success, error otherwise
  1035. */
  1036. int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
  1037. struct ma_state *mas_detach)
  1038. {
  1039. struct vm_area_struct *next = NULL;
  1040. int error;
  1041. /*
  1042. * If we need to split any vma, do it now to save pain later.
  1043. * Does it split the first one?
  1044. */
  1045. if (vms->start > vms->vma->vm_start) {
  1046. /*
  1047. * Make sure that map_count on return from munmap() will
  1048. * not exceed its limit; but let map_count go just above
  1049. * its limit temporarily, to help free resources as expected.
  1050. */
  1051. if (vms->end < vms->vma->vm_end &&
  1052. vms->vma->vm_mm->map_count >= sysctl_max_map_count) {
  1053. error = -ENOMEM;
  1054. goto map_count_exceeded;
  1055. }
  1056. /* Don't bother splitting the VMA if we can't unmap it anyway */
  1057. if (!can_modify_vma(vms->vma)) {
  1058. error = -EPERM;
  1059. goto start_split_failed;
  1060. }
  1061. error = __split_vma(vms->vmi, vms->vma, vms->start, 1);
  1062. if (error)
  1063. goto start_split_failed;
  1064. }
  1065. vms->prev = vma_prev(vms->vmi);
  1066. if (vms->prev)
  1067. vms->unmap_start = vms->prev->vm_end;
  1068. /*
  1069. * Detach a range of VMAs from the mm. Using next as a temp variable as
  1070. * it is always overwritten.
  1071. */
  1072. for_each_vma_range(*(vms->vmi), next, vms->end) {
  1073. long nrpages;
  1074. if (!can_modify_vma(next)) {
  1075. error = -EPERM;
  1076. goto modify_vma_failed;
  1077. }
  1078. /* Does it split the end? */
  1079. if (next->vm_end > vms->end) {
  1080. error = __split_vma(vms->vmi, next, vms->end, 0);
  1081. if (error)
  1082. goto end_split_failed;
  1083. }
  1084. vma_start_write(next);
  1085. mas_set(mas_detach, vms->vma_count++);
  1086. error = mas_store_gfp(mas_detach, next, GFP_KERNEL);
  1087. if (error)
  1088. goto munmap_gather_failed;
  1089. vma_mark_detached(next, true);
  1090. nrpages = vma_pages(next);
  1091. vms->nr_pages += nrpages;
  1092. if (next->vm_flags & VM_LOCKED)
  1093. vms->locked_vm += nrpages;
  1094. if (next->vm_flags & VM_ACCOUNT)
  1095. vms->nr_accounted += nrpages;
  1096. if (is_exec_mapping(next->vm_flags))
  1097. vms->exec_vm += nrpages;
  1098. else if (is_stack_mapping(next->vm_flags))
  1099. vms->stack_vm += nrpages;
  1100. else if (is_data_mapping(next->vm_flags))
  1101. vms->data_vm += nrpages;
  1102. if (unlikely(vms->uf)) {
  1103. /*
  1104. * If userfaultfd_unmap_prep returns an error the vmas
  1105. * will remain split, but userland will get a
  1106. * highly unexpected error anyway. This is no
  1107. * different from the case where the first of the two
  1108. * __split_vma calls fails, but we don't undo the first
  1109. * split even though we could. This failure is unlikely
  1110. * enough that it's not worth optimizing for.
  1111. */
  1112. error = userfaultfd_unmap_prep(next, vms->start,
  1113. vms->end, vms->uf);
  1114. if (error)
  1115. goto userfaultfd_error;
  1116. }
  1117. #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
  1118. BUG_ON(next->vm_start < vms->start);
  1119. BUG_ON(next->vm_start > vms->end);
  1120. #endif
  1121. }
  1122. vms->next = vma_next(vms->vmi);
  1123. if (vms->next)
  1124. vms->unmap_end = vms->next->vm_start;
  1125. #if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
  1126. /* Make sure no VMAs are about to be lost. */
  1127. {
  1128. MA_STATE(test, mas_detach->tree, 0, 0);
  1129. struct vm_area_struct *vma_mas, *vma_test;
  1130. int test_count = 0;
  1131. vma_iter_set(vms->vmi, vms->start);
  1132. rcu_read_lock();
  1133. vma_test = mas_find(&test, vms->vma_count - 1);
  1134. for_each_vma_range(*(vms->vmi), vma_mas, vms->end) {
  1135. BUG_ON(vma_mas != vma_test);
  1136. test_count++;
  1137. vma_test = mas_next(&test, vms->vma_count - 1);
  1138. }
  1139. rcu_read_unlock();
  1140. BUG_ON(vms->vma_count != test_count);
  1141. }
  1142. #endif
  1143. while (vma_iter_addr(vms->vmi) > vms->start)
  1144. vma_iter_prev_range(vms->vmi);
  1145. vms->clear_ptes = true;
  1146. return 0;
  1147. userfaultfd_error:
  1148. munmap_gather_failed:
  1149. end_split_failed:
  1150. modify_vma_failed:
  1151. reattach_vmas(mas_detach);
  1152. start_split_failed:
  1153. map_count_exceeded:
  1154. return error;
  1155. }
  1156. /*
  1157. * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
  1158. * @vmi: The vma iterator
  1159. * @vma: The starting vm_area_struct
  1160. * @mm: The mm_struct
  1161. * @start: The aligned start address to munmap.
  1162. * @end: The aligned end address to munmap.
  1163. * @uf: The userfaultfd list_head
  1164. * @unlock: Set to true to drop the mmap_lock. Unlocking only happens on
  1165. * success.
  1166. *
  1167. * Return: 0 on success and drops the lock if so directed, error and leaves the
  1168. * lock held otherwise.
  1169. */
  1170. int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
  1171. struct mm_struct *mm, unsigned long start, unsigned long end,
  1172. struct list_head *uf, bool unlock)
  1173. {
  1174. struct maple_tree mt_detach;
  1175. MA_STATE(mas_detach, &mt_detach, 0, 0);
  1176. mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
  1177. mt_on_stack(mt_detach);
  1178. struct vma_munmap_struct vms;
  1179. int error;
  1180. init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock);
  1181. error = vms_gather_munmap_vmas(&vms, &mas_detach);
  1182. if (error)
  1183. goto gather_failed;
  1184. error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
  1185. if (error)
  1186. goto clear_tree_failed;
  1187. /* Point of no return */
  1188. vms_complete_munmap_vmas(&vms, &mas_detach);
  1189. return 0;
  1190. clear_tree_failed:
  1191. reattach_vmas(&mas_detach);
  1192. gather_failed:
  1193. validate_mm(mm);
  1194. return error;
  1195. }
  1196. /*
  1197. * do_vmi_munmap() - munmap a given range.
  1198. * @vmi: The vma iterator
  1199. * @mm: The mm_struct
  1200. * @start: The start address to munmap
  1201. * @len: The length of the range to munmap
  1202. * @uf: The userfaultfd list_head
  1203. * @unlock: set to true if the user wants to drop the mmap_lock on success
  1204. *
  1205. * This function takes a @vmi that is either pointing to the previous VMA or
  1206. * set to MA_START and sets it up to remove the mapping(s). @len will be
  1207. * page-aligned.
  1208. *
  1209. * Return: 0 on success and drops the lock if so directed, error and leaves the
  1210. * lock held otherwise.
  1211. */
  1212. int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
  1213. unsigned long start, size_t len, struct list_head *uf,
  1214. bool unlock)
  1215. {
  1216. unsigned long end;
  1217. struct vm_area_struct *vma;
  1218. if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
  1219. return -EINVAL;
  1220. end = start + PAGE_ALIGN(len);
  1221. if (end == start)
  1222. return -EINVAL;
  1223. /* Find the first overlapping VMA */
  1224. vma = vma_find(vmi, end);
  1225. if (!vma) {
  1226. if (unlock)
  1227. mmap_write_unlock(mm);
  1228. return 0;
  1229. }
  1230. return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
  1231. }
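/*
 * Worked example of the alignment behaviour above (illustration only,
 * assuming 4KiB pages): a call equivalent to munmap(0x10000, 0x1800)
 * passes start == 0x10000 and len == 0x1800. start is already page
 * aligned, and PAGE_ALIGN(0x1800) == 0x2000, so:
 *
 *	end = 0x10000 + 0x2000;		unmaps [0x10000, 0x12000)
 *
 * i.e. a partial trailing page is rounded up and unmapped in full. A
 * misaligned start (offset_in_page(start) != 0) is rejected with -EINVAL
 * before any lookup is done.
 */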
  1232. /*
  1233. * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd
  1234. * context and anonymous VMA name within the range [start, end).
  1235. *
  1236. * As a result, we might be able to merge the newly modified VMA range with an
  1237. * adjacent VMA with identical properties.
  1238. *
  1239. * If no merge is possible and the range does not span the entirety of the VMA,
  1240. * we then need to split the VMA to accommodate the change.
  1241. *
  1242. * The function returns either the merged VMA, the original VMA if a split was
  1243. * required instead, or an error if the split failed.
  1244. */
  1245. static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
  1246. {
  1247. struct vm_area_struct *vma = vmg->vma;
  1248. unsigned long start = vmg->start;
  1249. unsigned long end = vmg->end;
  1250. struct vm_area_struct *merged;
  1251. /* First, try to merge. */
  1252. merged = vma_merge_existing_range(vmg);
  1253. if (merged)
  1254. return merged;
  1255. if (vmg_nomem(vmg))
  1256. return ERR_PTR(-ENOMEM);
  1257. /*
  1258. * Split can fail for reasons other than OOM, so if the user requests
  1259. * this it's probably a mistake.
  1260. */
  1261. VM_WARN_ON(vmg->give_up_on_oom &&
  1262. (vma->vm_start != start || vma->vm_end != end));
  1263. /* Split any preceding portion of the VMA. */
  1264. if (vma->vm_start < start) {
  1265. int err = split_vma(vmg->vmi, vma, start, 1);
  1266. if (err)
  1267. return ERR_PTR(err);
  1268. }
  1269. /* Split any trailing portion of the VMA. */
  1270. if (vma->vm_end > end) {
  1271. int err = split_vma(vmg->vmi, vma, end, 0);
  1272. if (err)
  1273. return ERR_PTR(err);
  1274. }
  1275. return vma;
  1276. }
  1277. struct vm_area_struct *vma_modify_flags(
  1278. struct vma_iterator *vmi, struct vm_area_struct *prev,
  1279. struct vm_area_struct *vma, unsigned long start, unsigned long end,
  1280. unsigned long new_flags)
  1281. {
  1282. VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
  1283. vmg.flags = new_flags;
  1284. return vma_modify(&vmg);
  1285. }
  1286. struct vm_area_struct
  1287. *vma_modify_flags_name(struct vma_iterator *vmi,
  1288. struct vm_area_struct *prev,
  1289. struct vm_area_struct *vma,
  1290. unsigned long start,
  1291. unsigned long end,
  1292. unsigned long new_flags,
  1293. struct anon_vma_name *new_name)
  1294. {
  1295. VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
  1296. vmg.flags = new_flags;
  1297. vmg.anon_name = new_name;
  1298. return vma_modify(&vmg);
  1299. }
  1300. struct vm_area_struct
  1301. *vma_modify_policy(struct vma_iterator *vmi,
  1302. struct vm_area_struct *prev,
  1303. struct vm_area_struct *vma,
  1304. unsigned long start, unsigned long end,
  1305. struct mempolicy *new_pol)
  1306. {
  1307. VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
  1308. vmg.policy = new_pol;
  1309. return vma_modify(&vmg);
  1310. }
  1311. struct vm_area_struct
  1312. *vma_modify_flags_uffd(struct vma_iterator *vmi,
  1313. struct vm_area_struct *prev,
  1314. struct vm_area_struct *vma,
  1315. unsigned long start, unsigned long end,
  1316. unsigned long new_flags,
  1317. struct vm_userfaultfd_ctx new_ctx,
  1318. bool give_up_on_oom)
  1319. {
  1320. VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
  1321. vmg.flags = new_flags;
  1322. vmg.uffd_ctx = new_ctx;
  1323. if (give_up_on_oom)
  1324. vmg.give_up_on_oom = true;
  1325. return vma_modify(&vmg);
  1326. }

/*
 * Expand vma by delta bytes, potentially merging with an immediately adjacent
 * VMA with identical properties.
 */
struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
                                        struct vm_area_struct *vma,
                                        unsigned long delta)
{
        VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta);

        vmg.next = vma_iter_next_rewind(vmi, NULL);
        vmg.vma = NULL; /* We use the VMA to populate VMG fields only. */

        return vma_merge_new_range(&vmg);
}
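
/*
 * Illustrative usage sketch (hypothetical caller, in the spirit of mremap()'s
 * in-place expansion): vma_merge_extend() either returns the VMA now covering
 * the extra delta bytes (after merging the new range with vma and/or the
 * following VMA) or NULL, in which case the caller must expand or map the
 * range by other means.
 *
 *	VMA_ITERATOR(vmi, mm, vma->vm_end);
 *	struct vm_area_struct *extended;
 *
 *	extended = vma_merge_extend(&vmi, vma, delta);
 *	if (!extended) {
 *		// No merge was possible; fall back to expanding or mapping
 *		// [vma->vm_end, vma->vm_end + delta) some other way.
 *	}
 */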

void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
{
        vb->count = 0;
}

static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb)
{
        struct address_space *mapping;
        int i;

        mapping = vb->vmas[0]->vm_file->f_mapping;
        i_mmap_lock_write(mapping);
        for (i = 0; i < vb->count; i++) {
                VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping);
                __remove_shared_vm_struct(vb->vmas[i], mapping);
        }
        i_mmap_unlock_write(mapping);

        unlink_file_vma_batch_init(vb);
}

void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
                               struct vm_area_struct *vma)
{
        if (vma->vm_file == NULL)
                return;

        /*
         * Flush the batch if it is full or if this VMA is backed by a
         * different file from the VMAs already queued.
         */
        if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) ||
            vb->count == ARRAY_SIZE(vb->vmas))
                unlink_file_vma_batch_process(vb);

        vb->vmas[vb->count] = vma;
        vb->count++;
}

void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
{
        if (vb->count > 0)
                unlink_file_vma_batch_process(vb);
}
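
/*
 * Illustrative usage sketch (hypothetical caller): the batch API above is
 * meant to be driven while tearing down a series of VMAs, so that VMAs backed
 * by the same file are unlinked under a single i_mmap lock acquisition:
 *
 *	struct unlink_vma_file_batch vb;
 *
 *	unlink_file_vma_batch_init(&vb);
 *	for_each_vma(vmi, vma)			// VMAs being removed
 *		unlink_file_vma_batch_add(&vb, vma);
 *	unlink_file_vma_batch_final(&vb);	// flush whatever is still queued
 */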

/*
 * Unlink a file-based vm structure from its interval tree, to hide
 * vma from rmap and vmtruncate before freeing its page tables.
 */
void unlink_file_vma(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;

        if (file) {
                struct address_space *mapping = file->f_mapping;

                i_mmap_lock_write(mapping);
                __remove_shared_vm_struct(vma, mapping);
                i_mmap_unlock_write(mapping);
        }
}

void vma_link_file(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;
        struct address_space *mapping;

        if (file) {
                mapping = file->f_mapping;
                i_mmap_lock_write(mapping);
                __vma_link_file(vma, mapping);
                i_mmap_unlock_write(mapping);
        }
}

int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
{
        VMA_ITERATOR(vmi, mm, 0);

        vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
        if (vma_iter_prealloc(&vmi, vma))
                return -ENOMEM;

        vma_start_write(vma);
        vma_iter_store(&vmi, vma);
        vma_link_file(vma);
        mm->map_count++;
        validate_mm(mm);
        return 0;
}

/*
 * Copy the vma structure to a new location in the same mm,
 * prior to moving page table entries, to effect an mremap move.
 */
struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
        unsigned long addr, unsigned long len, pgoff_t pgoff,
        bool *need_rmap_locks)
{
        struct vm_area_struct *vma = *vmap;
        unsigned long vma_start = vma->vm_start;
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *new_vma;
        bool faulted_in_anon_vma = true;
        VMA_ITERATOR(vmi, mm, addr);
        VMG_VMA_STATE(vmg, &vmi, NULL, vma, addr, addr + len);

        /*
         * If anonymous vma has not yet been faulted, update new pgoff
         * to match new location, to increase its chance of merging.
         */
        if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
                pgoff = addr >> PAGE_SHIFT;
                faulted_in_anon_vma = false;
        }

        new_vma = find_vma_prev(mm, addr, &vmg.prev);
        if (new_vma && new_vma->vm_start < addr + len)
                return NULL;    /* should never get here */

        vmg.vma = NULL; /* New VMA range. */
        vmg.pgoff = pgoff;
        vmg.next = vma_iter_next_rewind(&vmi, NULL);
        new_vma = vma_merge_new_range(&vmg);

        if (new_vma) {
                /*
                 * Source vma may have been merged into new_vma.
                 */
                if (unlikely(vma_start >= new_vma->vm_start &&
                             vma_start < new_vma->vm_end)) {
                        /*
                         * The only way we can get a vma_merge with
                         * self during an mremap is if the vma hasn't
                         * been faulted in yet and we were allowed to
                         * reset the dst vma->vm_pgoff to the
                         * destination address of the mremap to allow
                         * the merge to happen. mremap must change the
                         * vm_pgoff linearity between src and dst vmas
                         * (in turn preventing a vma_merge) to be
                         * safe. It is only safe to keep the vm_pgoff
                         * linear if there are no pages mapped yet.
                         */
                        VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
                        *vmap = vma = new_vma;
                }
                *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
        } else {
                new_vma = vm_area_dup(vma);
                if (!new_vma)
                        goto out;
                vma_set_range(new_vma, addr, addr + len, pgoff);
                if (vma_dup_policy(vma, new_vma))
                        goto out_free_vma;
                if (anon_vma_clone(new_vma, vma))
                        goto out_free_mempol;
                if (new_vma->vm_file)
                        get_file(new_vma->vm_file);
                if (new_vma->vm_ops && new_vma->vm_ops->open)
                        new_vma->vm_ops->open(new_vma);
                if (vma_link(mm, new_vma))
                        goto out_vma_link;
                *need_rmap_locks = false;
        }
        return new_vma;

out_vma_link:
        vma_close(new_vma);

        if (new_vma->vm_file)
                fput(new_vma->vm_file);

        unlink_anon_vmas(new_vma);
out_free_mempol:
        mpol_put(vma_policy(new_vma));
out_free_vma:
        vm_area_free(new_vma);
out:
        return NULL;
}
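
/*
 * Illustrative usage sketch (hypothetical caller, in the spirit of mremap()'s
 * move path): copy_vma() runs before the page tables are moved, and the value
 * returned in need_rmap_locks tells the caller whether the rmap locks must be
 * held while moving the PTEs. The pgoff computation below assumes old_addr
 * lies within the source VMA.
 *
 *	bool need_rmap_locks;
 *	struct vm_area_struct *new_vma;
 *	pgoff_t pgoff = vma->vm_pgoff +
 *			((old_addr - vma->vm_start) >> PAGE_SHIFT);
 *
 *	new_vma = copy_vma(&vma, new_addr, len, pgoff, &need_rmap_locks);
 *	if (!new_vma)
 *		return -ENOMEM;
 *	// ...move the page tables from old_addr to new_addr, taking the
 *	// rmap locks if need_rmap_locks is true...
 */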

/*
 * Rough compatibility check to quickly see if it's even worth looking
 * at sharing an anon_vma.
 *
 * They need to have the same vm_file, and the flags can only differ
 * in things that mprotect may change.
 *
 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
 * we can merge the two VMAs. For example, we refuse to merge a vma if
 * there is a vm_ops->close() function, because that indicates that the
 * driver is doing some kind of reference counting. But that doesn't
 * really matter for the anon_vma sharing case.
 */
static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
{
        return a->vm_end == b->vm_start &&
                mpol_equal(vma_policy(a), vma_policy(b)) &&
                a->vm_file == b->vm_file &&
                !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) &&
                b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
}
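
/*
 * Worked example (illustrative): two adjacent anonymous VMAs a and b that
 * differ only in VM_WRITE (say b was later mprotect()ed read-only) still pass
 * the check above, because
 *
 *	(a->vm_flags ^ b->vm_flags) == VM_WRITE
 *	(VM_WRITE & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) == 0
 *
 * whereas a difference outside that mask, e.g. VM_LOCKED, makes them
 * incompatible.
 */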

/*
 * Do some basic sanity checking to see if we can re-use the anon_vma
 * from 'old'. The 'a'/'b' VMAs are in VM order - one of them will be
 * the same as 'old', the other will be the new one that is trying
 * to share the anon_vma.
 *
 * NOTE! This runs with mmap_lock held for reading, so it is possible that
 * the anon_vma of 'old' is concurrently in the process of being set up
 * by another page fault trying to merge _that_. But that's ok: if it
 * is being set up, that automatically means that it will be a singleton
 * acceptable for merging, so we can do all of this optimistically. But
 * we do the READ_ONCE() to make sure that we never re-load the pointer.
 *
 * IOW: the "list_is_singular()" test on the anon_vma_chain only
 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
 * is to return an anon_vma that is "complex" due to having gone through
 * a fork).
 *
 * We also make sure that the two VMAs are compatible (adjacent,
 * and with the same memory policies). That's all stable, even with just
 * a read lock on the mmap_lock.
 */
static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old,
                                          struct vm_area_struct *a,
                                          struct vm_area_struct *b)
{
        if (anon_vma_compatible(a, b)) {
                struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);

                if (anon_vma && list_is_singular(&old->anon_vma_chain))
                        return anon_vma;
        }
        return NULL;
}

/*
 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
 * neighbouring vmas for a suitable anon_vma, before it goes off
 * to allocate a new anon_vma. It checks because a repetitive
 * sequence of mprotects and faults may otherwise lead to distinct
 * anon_vmas being allocated, preventing vma merge in subsequent
 * mprotect.
 */
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
        struct anon_vma *anon_vma = NULL;
        struct vm_area_struct *prev, *next;
        VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end);

        /* Try next first. */
        next = vma_iter_load(&vmi);
        if (next) {
                anon_vma = reusable_anon_vma(next, vma, next);
                if (anon_vma)
                        return anon_vma;
        }

        prev = vma_prev(&vmi);
        VM_BUG_ON_VMA(prev != vma, vma);
        prev = vma_prev(&vmi);
        /* Now try prev. */
        if (prev)
                anon_vma = reusable_anon_vma(prev, prev, vma);

        /*
         * We might reach here with anon_vma == NULL if we can't find
         * any reusable anon_vma.
         * There's no absolute need to look only at touching neighbours:
         * we could search further afield for "compatible" anon_vmas.
         * But it would probably just be a waste of time searching,
         * or lead to too many vmas hanging off the same anon_vma.
         * We're trying to allow mprotect remerging later on,
         * not trying to minimize memory used for anon_vmas.
         */
        return anon_vma;
}
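
/*
 * Illustrative usage sketch (simplified, in the spirit of
 * __anon_vma_prepare() in mm/rmap.c): the caller first tries to reuse a
 * neighbour's anon_vma and only allocates a fresh one when nothing suitable
 * is found.
 *
 *	struct anon_vma *anon_vma;
 *
 *	anon_vma = find_mergeable_anon_vma(vma);
 *	if (!anon_vma) {
 *		anon_vma = anon_vma_alloc();	// fall back to a new anon_vma
 *		if (!anon_vma)
 *			return -ENOMEM;
 *	}
 *	// ...link vma to anon_vma via an anon_vma_chain...
 */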

static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops)
{
        return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite);
}

static bool vma_is_shared_writable(struct vm_area_struct *vma)
{
        return (vma->vm_flags & (VM_WRITE | VM_SHARED)) ==
                (VM_WRITE | VM_SHARED);
}

static bool vma_fs_can_writeback(struct vm_area_struct *vma)
{
        /* No managed pages to writeback. */
        if (vma->vm_flags & VM_PFNMAP)
                return false;

        return vma->vm_file && vma->vm_file->f_mapping &&
                mapping_can_writeback(vma->vm_file->f_mapping);
}

/*
 * Does this VMA require the underlying folios to have their dirty state
 * tracked?
 */
bool vma_needs_dirty_tracking(struct vm_area_struct *vma)
{
        /* Only shared, writable VMAs require dirty tracking. */
        if (!vma_is_shared_writable(vma))
                return false;

        /* Does the filesystem need to be notified? */
        if (vm_ops_needs_writenotify(vma->vm_ops))
                return true;

        /*
         * Even if the filesystem doesn't indicate a need for writenotify, if it
         * can writeback, dirty tracking is still required.
         */
        return vma_fs_can_writeback(vma);
}
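
/*
 * Illustrative usage sketch (hypothetical caller and error choice): code that
 * wants to long-term pin pages in a writable file mapping can consult this
 * helper and refuse, since pinned pages may be written without the filesystem
 * ever seeing a write fault to mark them dirty.
 *
 *	if (want_longterm_pin && vma_needs_dirty_tracking(vma))
 *		return -EFAULT;		// hypothetical policy
 */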

/*
 * Some shared mappings will want the pages marked read-only
 * to track write events. If so, we'll downgrade vm_page_prot
 * to the private version (using protection_map[] without the
 * VM_SHARED bit).
 */
bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
{
        /* If it was private or non-writable, the write bit is already clear */
        if (!vma_is_shared_writable(vma))
                return false;

        /* The backer wishes to know when pages are first written to? */
        if (vm_ops_needs_writenotify(vma->vm_ops))
                return true;

        /* The open routine did something to the protections that pgprot_modify
         * won't preserve? */
        if (pgprot_val(vm_page_prot) !=
            pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags)))
                return false;

        /*
         * Do we need to track softdirty? hugetlb does not support softdirty
         * tracking yet.
         */
        if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
                return true;

        /* Do we need write faults for uffd-wp tracking? */
        if (userfaultfd_wp(vma))
                return true;

        /* Can the mapping track the dirty pages? */
        return vma_fs_can_writeback(vma);
}
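
/*
 * Illustrative usage sketch (simplified, in the spirit of
 * vma_set_page_prot()): when write notification is wanted, the effective page
 * protection is recomputed as if the mapping were private, so the first write
 * to each page faults and can be tracked.
 *
 *	unsigned long vm_flags = vma->vm_flags;
 *	pgprot_t prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
 *
 *	if (vma_wants_writenotify(vma, prot)) {
 *		vm_flags &= ~VM_SHARED;
 *		prot = vm_pgprot_modify(prot, vm_flags);
 *	}
 *	WRITE_ONCE(vma->vm_page_prot, prot);
 */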

static DEFINE_MUTEX(mm_all_locks_mutex);

static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
        if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
                /*
                 * The LSB of head.next can't change from under us
                 * because we hold the mm_all_locks_mutex.
                 */
                down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
                /*
                 * We can safely modify head.next after taking the
                 * anon_vma->root->rwsem. If some other vma in this mm shares
                 * the same anon_vma we won't take it again.
                 *
                 * No need of atomic instructions here, head.next
                 * can't change from under us thanks to the
                 * anon_vma->root->rwsem.
                 */
                if (__test_and_set_bit(0, (unsigned long *)
                                       &anon_vma->root->rb_root.rb_root.rb_node))
                        BUG();
        }
}

static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
{
        if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
                /*
                 * AS_MM_ALL_LOCKS can't change from under us because
                 * we hold the mm_all_locks_mutex.
                 *
                 * Operations on ->flags have to be atomic because
                 * even if AS_MM_ALL_LOCKS is stable thanks to the
                 * mm_all_locks_mutex, there may be other cpus
                 * changing other bitflags in parallel to us.
                 */
                if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
                        BUG();
                down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
        }
}

/*
 * This operation locks against the VM for all pte/vma/mm related
 * operations that could ever happen on a certain mm. This includes
 * vmtruncate, try_to_unmap, and all page faults.
 *
 * The caller must take the mmap_lock in write mode before calling
 * mm_take_all_locks(). The caller isn't allowed to release the
 * mmap_lock until mm_drop_all_locks() returns.
 *
 * mmap_lock in write mode is required in order to block all operations
 * that could modify pagetables and free pages without need of
 * altering the vma layout. It's also needed in write mode to avoid new
 * anon_vmas being associated with existing vmas.
 *
 * A single task can't take more than one mm_take_all_locks() in a row
 * or it would deadlock.
 *
 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
 * mapping->flags avoid taking the same lock twice, if more than one
 * vma in this mm is backed by the same anon_vma or address_space.
 *
 * We take locks in the following order, according to the comment at the
 * beginning of mm/rmap.c:
 * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
 *   hugetlb mappings);
 * - all vmas marked locked
 * - all i_mmap_rwsem locks;
 * - all anon_vma->rwsem locks
 *
 * We can take all locks within these types randomly because the VM code
 * doesn't nest them and we are protected from parallel mm_take_all_locks()
 * by mm_all_locks_mutex.
 *
 * mm_take_all_locks() and mm_drop_all_locks() are expensive operations
 * that may have to take thousands of locks.
 *
 * mm_take_all_locks() can fail if it's interrupted by signals.
 */
int mm_take_all_locks(struct mm_struct *mm)
{
        struct vm_area_struct *vma;
        struct anon_vma_chain *avc;
        VMA_ITERATOR(vmi, mm, 0);

        mmap_assert_write_locked(mm);

        mutex_lock(&mm_all_locks_mutex);

        /*
         * vma_start_write() does not have a complement in mm_drop_all_locks()
         * because vma_start_write() is always asymmetrical; it marks a VMA as
         * being written to until mmap_write_unlock() or mmap_write_downgrade()
         * is reached.
         */
        for_each_vma(vmi, vma) {
                if (signal_pending(current))
                        goto out_unlock;
                vma_start_write(vma);
        }

        vma_iter_init(&vmi, mm, 0);
        for_each_vma(vmi, vma) {
                if (signal_pending(current))
                        goto out_unlock;
                if (vma->vm_file && vma->vm_file->f_mapping &&
                                is_vm_hugetlb_page(vma))
                        vm_lock_mapping(mm, vma->vm_file->f_mapping);
        }

        vma_iter_init(&vmi, mm, 0);
        for_each_vma(vmi, vma) {
                if (signal_pending(current))
                        goto out_unlock;
                if (vma->vm_file && vma->vm_file->f_mapping &&
                                !is_vm_hugetlb_page(vma))
                        vm_lock_mapping(mm, vma->vm_file->f_mapping);
        }

        vma_iter_init(&vmi, mm, 0);
        for_each_vma(vmi, vma) {
                if (signal_pending(current))
                        goto out_unlock;
                if (vma->anon_vma)
                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                                vm_lock_anon_vma(mm, avc->anon_vma);
        }

        return 0;

out_unlock:
        mm_drop_all_locks(mm);
        return -EINTR;
}
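
/*
 * Illustrative usage sketch (hypothetical caller, e.g. a registration path
 * that must quiesce the whole mm): mm_take_all_locks() is called under a
 * write-locked mmap_lock and, on success, must be paired with
 * mm_drop_all_locks() before the mmap_lock is released.
 *
 *	mmap_write_lock(mm);
 *	ret = mm_take_all_locks(mm);
 *	if (ret)
 *		goto out_unlock;	// -EINTR: interrupted by a signal
 *
 *	// ...operate with every i_mmap_rwsem and anon_vma rwsem held...
 *
 *	mm_drop_all_locks(mm);
 * out_unlock:
 *	mmap_write_unlock(mm);
 */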

static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
{
        if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
                /*
                 * The LSB of head.next can't change to 0 from under
                 * us because we hold the mm_all_locks_mutex.
                 *
                 * We must however clear the bitflag before unlocking
                 * the vma so the users using the anon_vma->rb_root will
                 * never see our bitflag.
                 *
                 * No need of atomic instructions here, head.next
                 * can't change from under us until we release the
                 * anon_vma->root->rwsem.
                 */
                if (!__test_and_clear_bit(0, (unsigned long *)
                                          &anon_vma->root->rb_root.rb_root.rb_node))
                        BUG();
                anon_vma_unlock_write(anon_vma);
        }
}

static void vm_unlock_mapping(struct address_space *mapping)
{
        if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
                /*
                 * AS_MM_ALL_LOCKS can't change to 0 from under us
                 * because we hold the mm_all_locks_mutex.
                 */
                i_mmap_unlock_write(mapping);
                if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
                                        &mapping->flags))
                        BUG();
        }
}

/*
 * The mmap_lock cannot be released by the caller until
 * mm_drop_all_locks() returns.
 */
void mm_drop_all_locks(struct mm_struct *mm)
{
        struct vm_area_struct *vma;
        struct anon_vma_chain *avc;
        VMA_ITERATOR(vmi, mm, 0);

        mmap_assert_write_locked(mm);
        BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));

        for_each_vma(vmi, vma) {
                if (vma->anon_vma)
                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                                vm_unlock_anon_vma(avc->anon_vma);
                if (vma->vm_file && vma->vm_file->f_mapping)
                        vm_unlock_mapping(vma->vm_file->f_mapping);
        }

        mutex_unlock(&mm_all_locks_mutex);
}