vma.c

  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. * VMA-specific functions.
  4. */
  5. #include "vma_internal.h"
  6. #include "vma.h"
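/*
 * is_mergeable_vma() - Check whether the VMA adjacent to the proposed range
 * (@vmg->next if @merge_next is true, otherwise @vmg->prev) has attributes
 * compatible with those described in @vmg: mempolicy, vm_flags (ignoring
 * VM_SOFTDIRTY), backing file, userfaultfd context and anon_vma name.
 */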
  7. static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
  8. {
  9. struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
  10. if (!mpol_equal(vmg->policy, vma_policy(vma)))
  11. return false;
  12. /*
  13. * VM_SOFTDIRTY should not prevent from VMA merging, if we
  14. * match the flags but dirty bit -- the caller should mark
  15. * merged VMA as dirty. If dirty bit won't be excluded from
  16. * comparison, we increase pressure on the memory system forcing
  17. * the kernel to generate new VMAs when old one could be
  18. * extended instead.
  19. */
  20. if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY)
  21. return false;
  22. if (vma->vm_file != vmg->file)
  23. return false;
  24. if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx))
  25. return false;
  26. if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name))
  27. return false;
  28. return true;
  29. }
  30. static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
  31. struct anon_vma *anon_vma2, struct vm_area_struct *vma)
  32. {
  33. /*
  34. * The list_is_singular() test is to avoid merging VMAs cloned from
  35. * parents, which would otherwise hurt scalability due to anon_vma lock contention.
  36. */
  37. if ((!anon_vma1 || !anon_vma2) && (!vma ||
  38. list_is_singular(&vma->anon_vma_chain)))
  39. return true;
  40. return anon_vma1 == anon_vma2;
  41. }
  42. /* Are the anon_vma's belonging to each VMA compatible with one another? */
  43. static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1,
  44. struct vm_area_struct *vma2)
  45. {
  46. return is_mergeable_anon_vma(vma1->anon_vma, vma2->anon_vma, NULL);
  47. }
  48. /*
  49. * init_multi_vma_prep() - Initializer for struct vma_prepare
  50. * @vp: The vma_prepare struct
  51. * @vma: The vma that will be altered once locked
  52. * @next: The next vma if it is to be adjusted
  53. * @remove: The first vma to be removed
  54. * @remove2: The second vma to be removed
  55. */
  56. static void init_multi_vma_prep(struct vma_prepare *vp,
  57. struct vm_area_struct *vma,
  58. struct vm_area_struct *next,
  59. struct vm_area_struct *remove,
  60. struct vm_area_struct *remove2)
  61. {
  62. memset(vp, 0, sizeof(struct vma_prepare));
  63. vp->vma = vma;
  64. vp->anon_vma = vma->anon_vma;
  65. vp->remove = remove;
  66. vp->remove2 = remove2;
  67. vp->adj_next = next;
  68. if (!vp->anon_vma && next)
  69. vp->anon_vma = next->anon_vma;
  70. vp->file = vma->vm_file;
  71. if (vp->file)
  72. vp->mapping = vma->vm_file->f_mapping;
  73. }
  74. /*
  75. * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
  76. * in front of (at a lower virtual address and file offset than) the vma.
  77. *
  78. * We cannot merge two vmas if they have differently assigned (non-NULL)
  79. * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
  80. *
  81. * We don't check here for the merged mmap wrapping around the end of pagecache
  82. * indices (16TB on ia32) because do_mmap() does not permit mmap's which
  83. * wrap, nor mmaps which cover the final page at index -1UL.
  84. *
  85. * We assume the vma may be removed as part of the merge.
  86. */
  87. static bool can_vma_merge_before(struct vma_merge_struct *vmg)
  88. {
  89. pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
  90. if (is_mergeable_vma(vmg, /* merge_next = */ true) &&
  91. is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) {
  92. if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
  93. return true;
  94. }
  95. return false;
  96. }
  97. /*
  98. * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
  99. * beyond (at a higher virtual address and file offset than) the vma.
  100. *
  101. * We cannot merge two vmas if they have differently assigned (non-NULL)
  102. * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
  103. *
  104. * We assume that vma is not removed as part of the merge.
  105. */
  106. static bool can_vma_merge_after(struct vma_merge_struct *vmg)
  107. {
  108. if (is_mergeable_vma(vmg, /* merge_next = */ false) &&
  109. is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
  110. if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff)
  111. return true;
  112. }
  113. return false;
  114. }
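/*
 * Link a file-backed VMA into its address_space's i_mmap interval tree.
 * Requires @mapping's i_mmap_rwsem held for writing, as with
 * __remove_shared_vm_struct() below.
 */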
  115. static void __vma_link_file(struct vm_area_struct *vma,
  116. struct address_space *mapping)
  117. {
  118. if (vma_is_shared_maywrite(vma))
  119. mapping_allow_writable(mapping);
  120. flush_dcache_mmap_lock(mapping);
  121. vma_interval_tree_insert(vma, &mapping->i_mmap);
  122. flush_dcache_mmap_unlock(mapping);
  123. }
  124. /*
  125. * Requires inode->i_mapping->i_mmap_rwsem
  126. */
  127. static void __remove_shared_vm_struct(struct vm_area_struct *vma,
  128. struct address_space *mapping)
  129. {
  130. if (vma_is_shared_maywrite(vma))
  131. mapping_unmap_writable(mapping);
  132. flush_dcache_mmap_lock(mapping);
  133. vma_interval_tree_remove(vma, &mapping->i_mmap);
  134. flush_dcache_mmap_unlock(mapping);
  135. }
  136. /*
  137. * vma_prepare() - Helper function for handling locking VMAs prior to altering
  138. * @vp: The initialized vma_prepare struct
  139. */
  140. static void vma_prepare(struct vma_prepare *vp)
  141. {
  142. if (vp->file) {
  143. uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
  144. if (vp->adj_next)
  145. uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
  146. vp->adj_next->vm_end);
  147. i_mmap_lock_write(vp->mapping);
  148. if (vp->insert && vp->insert->vm_file) {
  149. /*
  150. * Put into interval tree now, so instantiated pages
  151. * are visible to arm/parisc __flush_dcache_page
  152. * throughout; but we cannot insert into address
  153. * space until vma start or end is updated.
  154. */
  155. __vma_link_file(vp->insert,
  156. vp->insert->vm_file->f_mapping);
  157. }
  158. }
  159. if (vp->anon_vma) {
  160. anon_vma_lock_write(vp->anon_vma);
  161. anon_vma_interval_tree_pre_update_vma(vp->vma);
  162. if (vp->adj_next)
  163. anon_vma_interval_tree_pre_update_vma(vp->adj_next);
  164. }
  165. if (vp->file) {
  166. flush_dcache_mmap_lock(vp->mapping);
  167. vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
  168. if (vp->adj_next)
  169. vma_interval_tree_remove(vp->adj_next,
  170. &vp->mapping->i_mmap);
  171. }
  172. }
  173. /*
  174. * vma_complete() - Helper function for handling the unlocking after altering VMAs,
  175. * or for inserting a VMA.
  176. *
  177. * @vp: The vma_prepare struct
  178. * @vmi: The vma iterator
  179. * @mm: The mm_struct
  180. */
  181. static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
  182. struct mm_struct *mm)
  183. {
  184. if (vp->file) {
  185. if (vp->adj_next)
  186. vma_interval_tree_insert(vp->adj_next,
  187. &vp->mapping->i_mmap);
  188. vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
  189. flush_dcache_mmap_unlock(vp->mapping);
  190. }
  191. if (vp->remove && vp->file) {
  192. __remove_shared_vm_struct(vp->remove, vp->mapping);
  193. if (vp->remove2)
  194. __remove_shared_vm_struct(vp->remove2, vp->mapping);
  195. } else if (vp->insert) {
  196. /*
  197. * split_vma has split insert from vma, and needs
  198. * us to insert it before dropping the locks
  199. * (it may either follow vma or precede it).
  200. */
  201. vma_iter_store(vmi, vp->insert);
  202. mm->map_count++;
  203. }
  204. if (vp->anon_vma) {
  205. anon_vma_interval_tree_post_update_vma(vp->vma);
  206. if (vp->adj_next)
  207. anon_vma_interval_tree_post_update_vma(vp->adj_next);
  208. anon_vma_unlock_write(vp->anon_vma);
  209. }
  210. if (vp->file) {
  211. i_mmap_unlock_write(vp->mapping);
  212. uprobe_mmap(vp->vma);
  213. if (vp->adj_next)
  214. uprobe_mmap(vp->adj_next);
  215. }
  216. if (vp->remove) {
  217. again:
  218. vma_mark_detached(vp->remove, true);
  219. if (vp->file) {
  220. uprobe_munmap(vp->remove, vp->remove->vm_start,
  221. vp->remove->vm_end);
  222. fput(vp->file);
  223. }
  224. if (vp->remove->anon_vma)
  225. anon_vma_merge(vp->vma, vp->remove);
  226. mm->map_count--;
  227. mpol_put(vma_policy(vp->remove));
  228. if (!vp->remove2)
  229. WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
  230. vm_area_free(vp->remove);
  231. /*
  232. * In mprotect's case 6 (see comments on vma_merge),
  233. * we are removing both mid and next vmas
  234. */
  235. if (vp->remove2) {
  236. vp->remove = vp->remove2;
  237. vp->remove2 = NULL;
  238. goto again;
  239. }
  240. }
  241. if (vp->insert && vp->file)
  242. uprobe_mmap(vp->insert);
  243. }
  244. /*
  245. * init_vma_prep() - Initializer wrapper for vma_prepare struct
  246. * @vp: The vma_prepare struct
  247. * @vma: The vma that will be altered once locked
  248. */
  249. static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma)
  250. {
  251. init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
  252. }
  253. /*
  254. * Can the proposed VMA be merged with the left (previous) VMA taking into
  255. * account the start position of the proposed range.
  256. */
  257. static bool can_vma_merge_left(struct vma_merge_struct *vmg)
  258. {
  259. return vmg->prev && vmg->prev->vm_end == vmg->start &&
  260. can_vma_merge_after(vmg);
  261. }
  262. /*
  263. * Can the proposed VMA be merged with the right (next) VMA taking into
  264. * account the end position of the proposed range.
  265. *
  266. * In addition, if we can merge with the left VMA, ensure that left and right
  267. * anon_vma's are also compatible.
  268. */
  269. static bool can_vma_merge_right(struct vma_merge_struct *vmg,
  270. bool can_merge_left)
  271. {
  272. if (!vmg->next || vmg->end != vmg->next->vm_start ||
  273. !can_vma_merge_before(vmg))
  274. return false;
  275. if (!can_merge_left)
  276. return true;
  277. /*
  278. * If we can merge with prev (left) and next (right), indicating that
  279. * each VMA's anon_vma is compatible with the proposed anon_vma, this
  280. * does not mean prev and next are compatible with EACH OTHER.
  281. *
  282. * We therefore check this in addition to mergeability to either side.
  283. */
  284. return are_anon_vmas_compatible(vmg->prev, vmg->next);
  285. }
  286. /*
  287. * Close a vm structure and free it.
  288. */
  289. void remove_vma(struct vm_area_struct *vma, bool unreachable)
  290. {
  291. might_sleep();
  292. vma_close(vma);
  293. if (vma->vm_file)
  294. fput(vma->vm_file);
  295. mpol_put(vma_policy(vma));
  296. if (unreachable)
  297. __vm_area_free(vma);
  298. else
  299. vm_area_free(vma);
  300. }
  301. /*
  302. * Get rid of page table information in the indicated region.
  303. *
  304. * Called with the mm semaphore held.
  305. */
  306. void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
  307. struct vm_area_struct *prev, struct vm_area_struct *next)
  308. {
  309. struct mm_struct *mm = vma->vm_mm;
  310. struct mmu_gather tlb;
  311. lru_add_drain();
  312. tlb_gather_mmu(&tlb, mm);
  313. update_hiwater_rss(mm);
  314. unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end,
  315. /* mm_wr_locked = */ true);
  316. mas_set(mas, vma->vm_end);
  317. free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
  318. next ? next->vm_start : USER_PGTABLES_CEILING,
  319. /* mm_wr_locked = */ true);
  320. tlb_finish_mmu(&tlb);
  321. }
  322. /*
  323. * __split_vma() bypasses sysctl_max_map_count checking. We use this where it
  324. * has already been checked or doesn't make sense to fail.
  325. * VMA Iterator will point to the original VMA.
  326. */
  327. static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
  328. unsigned long addr, int new_below)
  329. {
  330. struct vma_prepare vp;
  331. struct vm_area_struct *new;
  332. int err;
  333. WARN_ON(vma->vm_start >= addr);
  334. WARN_ON(vma->vm_end <= addr);
  335. if (vma->vm_ops && vma->vm_ops->may_split) {
  336. err = vma->vm_ops->may_split(vma, addr);
  337. if (err)
  338. return err;
  339. }
  340. new = vm_area_dup(vma);
  341. if (!new)
  342. return -ENOMEM;
  343. if (new_below) {
  344. new->vm_end = addr;
  345. } else {
  346. new->vm_start = addr;
  347. new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
  348. }
  349. err = -ENOMEM;
  350. vma_iter_config(vmi, new->vm_start, new->vm_end);
  351. if (vma_iter_prealloc(vmi, new))
  352. goto out_free_vma;
  353. err = vma_dup_policy(vma, new);
  354. if (err)
  355. goto out_free_vmi;
  356. err = anon_vma_clone(new, vma);
  357. if (err)
  358. goto out_free_mpol;
  359. if (new->vm_file)
  360. get_file(new->vm_file);
  361. if (new->vm_ops && new->vm_ops->open)
  362. new->vm_ops->open(new);
  363. vma_start_write(vma);
  364. vma_start_write(new);
  365. init_vma_prep(&vp, vma);
  366. vp.insert = new;
  367. vma_prepare(&vp);
  368. vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
  369. if (new_below) {
  370. vma->vm_start = addr;
  371. vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT;
  372. } else {
  373. vma->vm_end = addr;
  374. }
  375. /* vma_complete stores the new vma */
  376. vma_complete(&vp, vmi, vma->vm_mm);
  377. validate_mm(vma->vm_mm);
  378. /* Success. */
  379. if (new_below)
  380. vma_next(vmi);
  381. else
  382. vma_prev(vmi);
  383. return 0;
  384. out_free_mpol:
  385. mpol_put(vma_policy(new));
  386. out_free_vmi:
  387. vma_iter_free(vmi);
  388. out_free_vma:
  389. vm_area_free(new);
  390. return err;
  391. }
  392. /*
  393. * Split a vma into two pieces at address 'addr', a new vma is allocated
  394. * either for the first part or the tail.
  395. */
  396. static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
  397. unsigned long addr, int new_below)
  398. {
  399. if (vma->vm_mm->map_count >= sysctl_max_map_count)
  400. return -ENOMEM;
  401. return __split_vma(vmi, vma, addr, new_below);
  402. }
  403. /*
  404. * vma has some anon_vma assigned, and is already inserted on that
  405. * anon_vma's interval trees.
  406. *
  407. * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
  408. * vma must be removed from the anon_vma's interval trees using
  409. * anon_vma_interval_tree_pre_update_vma().
  410. *
  411. * After the update, the vma will be reinserted using
  412. * anon_vma_interval_tree_post_update_vma().
  413. *
  414. * The entire update must be protected by exclusive mmap_lock and by
  415. * the root anon_vma's mutex.
  416. */
  417. void
  418. anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
  419. {
  420. struct anon_vma_chain *avc;
  421. list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
  422. anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
  423. }
  424. void
  425. anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
  426. {
  427. struct anon_vma_chain *avc;
  428. list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
  429. anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
  430. }
  431. /*
  432. * dup_anon_vma() - Helper function to duplicate anon_vma
  433. * @dst: The destination VMA
  434. * @src: The source VMA
  435. * @dup: Pointer to the destination VMA when successful.
  436. *
  437. * Returns: 0 on success.
  438. */
  439. static int dup_anon_vma(struct vm_area_struct *dst,
  440. struct vm_area_struct *src, struct vm_area_struct **dup)
  441. {
  442. /*
  443. * Easily overlooked: when mprotect shifts the boundary, make sure the
  444. * expanding vma has anon_vma set if the shrinking vma had, to cover any
  445. * anon pages imported.
  446. */
  447. if (src->anon_vma && !dst->anon_vma) {
  448. int ret;
  449. vma_assert_write_locked(dst);
  450. dst->anon_vma = src->anon_vma;
  451. ret = anon_vma_clone(dst, src);
  452. if (ret)
  453. return ret;
  454. *dup = dst;
  455. }
  456. return 0;
  457. }
  458. #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
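/*
 * validate_mm() - Debug-only consistency check: verify the maple tree, check
 * that each VMA's vm_start/vm_end match the tree's view of it, verify each
 * anon_vma interval tree (with CONFIG_DEBUG_VM_RB), and confirm the VMA count
 * matches mm->map_count.
 */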
  459. void validate_mm(struct mm_struct *mm)
  460. {
  461. int bug = 0;
  462. int i = 0;
  463. struct vm_area_struct *vma;
  464. VMA_ITERATOR(vmi, mm, 0);
  465. mt_validate(&mm->mm_mt);
  466. for_each_vma(vmi, vma) {
  467. #ifdef CONFIG_DEBUG_VM_RB
  468. struct anon_vma *anon_vma = vma->anon_vma;
  469. struct anon_vma_chain *avc;
  470. #endif
  471. unsigned long vmi_start, vmi_end;
  472. bool warn = 0;
  473. vmi_start = vma_iter_addr(&vmi);
  474. vmi_end = vma_iter_end(&vmi);
  475. if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm))
  476. warn = 1;
  477. if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm))
  478. warn = 1;
  479. if (warn) {
  480. pr_emerg("issue in %s\n", current->comm);
  481. dump_stack();
  482. dump_vma(vma);
  483. pr_emerg("tree range: %px start %lx end %lx\n", vma,
  484. vmi_start, vmi_end - 1);
  485. vma_iter_dump_tree(&vmi);
  486. }
  487. #ifdef CONFIG_DEBUG_VM_RB
  488. if (anon_vma) {
  489. anon_vma_lock_read(anon_vma);
  490. list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
  491. anon_vma_interval_tree_verify(avc);
  492. anon_vma_unlock_read(anon_vma);
  493. }
  494. #endif
  495. i++;
  496. }
  497. if (i != mm->map_count) {
  498. pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
  499. bug = 1;
  500. }
  501. VM_BUG_ON_MM(bug, mm);
  502. }
  503. #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
  504. /* Actually perform the VMA merge operation. */
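/*
 * @adjust is an adjacent VMA whose vm_start is shifted by @adj_start bytes
 * (and vm_pgoff by the corresponding number of pages), @remove and @remove2
 * are VMAs unlinked and freed by the merge, and @expanded indicates that
 * @vmg->vma now spans [@vmg->start, @vmg->end) and is stored over that range
 * in the maple tree. Returns 0 on success, -ENOMEM if preallocation fails.
 */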
  505. static int commit_merge(struct vma_merge_struct *vmg,
  506. struct vm_area_struct *adjust,
  507. struct vm_area_struct *remove,
  508. struct vm_area_struct *remove2,
  509. long adj_start,
  510. bool expanded)
  511. {
  512. struct vma_prepare vp;
  513. init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2);
  514. VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
  515. vp.anon_vma != adjust->anon_vma);
  516. if (expanded) {
  517. /* Note: vma iterator must be pointing to 'start'. */
  518. vma_iter_config(vmg->vmi, vmg->start, vmg->end);
  519. } else {
  520. vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
  521. adjust->vm_end);
  522. }
  523. if (vma_iter_prealloc(vmg->vmi, vmg->vma))
  524. return -ENOMEM;
  525. vma_prepare(&vp);
  526. vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, adj_start);
  527. vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff);
  528. if (expanded)
  529. vma_iter_store(vmg->vmi, vmg->vma);
  530. if (adj_start) {
  531. adjust->vm_start += adj_start;
  532. adjust->vm_pgoff += PHYS_PFN(adj_start);
  533. if (adj_start < 0) {
  534. WARN_ON(expanded);
  535. vma_iter_store(vmg->vmi, adjust);
  536. }
  537. }
  538. vma_complete(&vp, vmg->vmi, vmg->vma->vm_mm);
  539. return 0;
  540. }
  541. /* We can only remove VMAs when merging if they do not have a close hook. */
  542. static bool can_merge_remove_vma(struct vm_area_struct *vma)
  543. {
  544. return !vma->vm_ops || !vma->vm_ops->close;
  545. }
  546. /*
  547. * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its
  548. * attributes modified.
  549. *
  550. * @vmg: Describes the modifications being made to a VMA and associated
  551. * metadata.
  552. *
  553. * When the attributes of a range within a VMA change, then it might be possible
  554. * for immediately adjacent VMAs to be merged into that VMA due to having
  555. * identical properties.
  556. *
  557. * This function checks for the existence of any such mergeable VMAs and updates
  558. * the maple tree describing the @vmg->vma->vm_mm address space to account for
  559. * this, as well as any VMAs shrunk/expanded/deleted as a result of this merge.
  560. *
  561. * As part of this operation, if a merge occurs, the @vmg object will have its
  562. * vma, start, end, and pgoff fields modified to execute the merge. Subsequent
  563. * calls to this function should reset these fields.
  564. *
  565. * Returns: The merged VMA if merge succeeds, or NULL otherwise.
  566. *
  567. * ASSUMPTIONS:
  568. * - The caller must assign the VMA to be modified to @vmg->vma.
  569. * - The caller must have set @vmg->prev to the previous VMA, if there is one.
  570. * - The caller must not set @vmg->next, as we determine this.
  571. * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
  572. * - vmi must be positioned within [@vmg->vma->vm_start, @vmg->vma->vm_end).
  573. */
  574. static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *vmg)
  575. {
  576. struct vm_area_struct *vma = vmg->vma;
  577. struct vm_area_struct *prev = vmg->prev;
  578. struct vm_area_struct *next, *res;
  579. struct vm_area_struct *anon_dup = NULL;
  580. struct vm_area_struct *adjust = NULL;
  581. unsigned long start = vmg->start;
  582. unsigned long end = vmg->end;
  583. bool left_side = vma && start == vma->vm_start;
  584. bool right_side = vma && end == vma->vm_end;
  585. int err = 0;
  586. long adj_start = 0;
  587. bool merge_will_delete_vma, merge_will_delete_next;
  588. bool merge_left, merge_right, merge_both;
  589. bool expanded;
  590. mmap_assert_write_locked(vmg->mm);
  591. VM_WARN_ON(!vma); /* We are modifying a VMA, so caller must specify. */
  592. VM_WARN_ON(vmg->next); /* We set this. */
  593. VM_WARN_ON(prev && start <= prev->vm_start);
  594. VM_WARN_ON(start >= end);
  595. /*
  596. * If vma == prev, the range may start partway into the VMA. Otherwise it
  597. * must start at the VMA's start, and it must never extend beyond its end.
  598. */
  599. VM_WARN_ON(vma && ((vma != prev && vmg->start != vma->vm_start) ||
  600. vmg->end > vma->vm_end));
  601. /* The vmi must be positioned within vmg->vma. */
  602. VM_WARN_ON(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start &&
  603. vma_iter_addr(vmg->vmi) < vma->vm_end));
  604. vmg->state = VMA_MERGE_NOMERGE;
  605. /*
  606. * If a special mapping or if the range being modified is neither at the
  607. * furthermost left or right side of the VMA, then we have no chance of
  608. * merging and should abort.
  609. */
  610. if (vmg->flags & VM_SPECIAL || (!left_side && !right_side))
  611. return NULL;
  612. if (left_side)
  613. merge_left = can_vma_merge_left(vmg);
  614. else
  615. merge_left = false;
  616. if (right_side) {
  617. next = vmg->next = vma_iter_next_range(vmg->vmi);
  618. vma_iter_prev_range(vmg->vmi);
  619. merge_right = can_vma_merge_right(vmg, merge_left);
  620. } else {
  621. merge_right = false;
  622. next = NULL;
  623. }
  624. if (merge_left) /* If merging prev, position iterator there. */
  625. vma_prev(vmg->vmi);
  626. else if (!merge_right) /* If we have nothing to merge, abort. */
  627. return NULL;
  628. merge_both = merge_left && merge_right;
  629. /* If we span the entire VMA, a merge implies it will be deleted. */
  630. merge_will_delete_vma = left_side && right_side;
  631. /*
  632. * If we need to remove vma in its entirety but are unable to do so,
  633. * we have no sensible recourse but to abort the merge.
  634. */
  635. if (merge_will_delete_vma && !can_merge_remove_vma(vma))
  636. return NULL;
  637. /*
  638. * If we merge both VMAs, then next is also deleted. This implies
  639. * merge_will_delete_vma also.
  640. */
  641. merge_will_delete_next = merge_both;
  642. /*
  643. * If we cannot delete next, then we can reduce the operation to merging
  644. * prev and vma (thereby deleting vma).
  645. */
  646. if (merge_will_delete_next && !can_merge_remove_vma(next)) {
  647. merge_will_delete_next = false;
  648. merge_right = false;
  649. merge_both = false;
  650. }
  651. /* No matter what happens, we will be adjusting vma. */
  652. vma_start_write(vma);
  653. if (merge_left)
  654. vma_start_write(prev);
  655. if (merge_right)
  656. vma_start_write(next);
  657. if (merge_both) {
  658. /*
  659. * |<----->|
  660. * |-------*********-------|
  661. * prev vma next
  662. * extend delete delete
  663. */
  664. vmg->vma = prev;
  665. vmg->start = prev->vm_start;
  666. vmg->end = next->vm_end;
  667. vmg->pgoff = prev->vm_pgoff;
  668. /*
  669. * We already ensured anon_vma compatibility above, so now it's
  670. * simply a case of, if prev has no anon_vma object, which of
  671. * next or vma contains the anon_vma we must duplicate.
  672. */
  673. err = dup_anon_vma(prev, next->anon_vma ? next : vma, &anon_dup);
  674. } else if (merge_left) {
  675. /*
  676. * |<----->| OR
  677. * |<--------->|
  678. * |-------*************
  679. * prev vma
  680. * extend shrink/delete
  681. */
  682. vmg->vma = prev;
  683. vmg->start = prev->vm_start;
  684. vmg->pgoff = prev->vm_pgoff;
  685. if (!merge_will_delete_vma) {
  686. adjust = vma;
  687. adj_start = vmg->end - vma->vm_start;
  688. }
  689. err = dup_anon_vma(prev, vma, &anon_dup);
  690. } else { /* merge_right */
  691. /*
  692. * |<----->| OR
  693. * |<--------->|
  694. * *************-------|
  695. * vma next
  696. * shrink/delete extend
  697. */
  698. pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
  699. VM_WARN_ON(!merge_right);
  700. /* If we are offset into a VMA, then prev must be vma. */
  701. VM_WARN_ON(vmg->start > vma->vm_start && prev && vma != prev);
  702. if (merge_will_delete_vma) {
  703. vmg->vma = next;
  704. vmg->end = next->vm_end;
  705. vmg->pgoff = next->vm_pgoff - pglen;
  706. } else {
  707. /*
  708. * We shrink vma and expand next.
  709. *
  710. * IMPORTANT: This is the ONLY case where the final
  711. * merged VMA is NOT vmg->vma, but rather vmg->next.
  712. */
  713. vmg->start = vma->vm_start;
  714. vmg->end = start;
  715. vmg->pgoff = vma->vm_pgoff;
  716. adjust = next;
  717. adj_start = -(vma->vm_end - start);
  718. }
  719. err = dup_anon_vma(next, vma, &anon_dup);
  720. }
  721. if (err)
  722. goto abort;
  723. /*
  724. * In nearly all cases, we expand vmg->vma. There is one exception -
  725. * merge_right where we partially span the VMA. In this case we shrink
  726. * the end of vmg->vma and adjust the start of vmg->next accordingly.
  727. */
  728. expanded = !merge_right || merge_will_delete_vma;
  729. if (commit_merge(vmg, adjust,
  730. merge_will_delete_vma ? vma : NULL,
  731. merge_will_delete_next ? next : NULL,
  732. adj_start, expanded)) {
  733. if (anon_dup)
  734. unlink_anon_vmas(anon_dup);
  735. vmg->state = VMA_MERGE_ERROR_NOMEM;
  736. return NULL;
  737. }
  738. res = merge_left ? prev : next;
  739. khugepaged_enter_vma(res, vmg->flags);
  740. vmg->state = VMA_MERGE_SUCCESS;
  741. return res;
  742. abort:
  743. vma_iter_set(vmg->vmi, start);
  744. vma_iter_load(vmg->vmi);
  745. vmg->state = VMA_MERGE_ERROR_NOMEM;
  746. return NULL;
  747. }
  748. /*
  749. * vma_merge_new_range - Attempt to merge a new VMA into address space
  750. *
  751. * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
  752. * (exclusive), which we try to merge with any adjacent VMAs if possible.
  753. *
  754. * We are about to add a VMA to the address space starting at @vmg->start and
  755. * ending at @vmg->end. There are three different possible scenarios:
  756. *
  757. * 1. There is a VMA with identical properties immediately adjacent to the
  758. * proposed new VMA [@vmg->start, @vmg->end) either before or after it -
  759. * EXPAND that VMA:
  760. *
  761. * Proposed: |-----| or |-----|
  762. * Existing: |----| |----|
  763. *
  764. * 2. There are VMAs with identical properties immediately adjacent to the
  765. * proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
  766. * EXPAND the former and REMOVE the latter:
  767. *
  768. * Proposed: |-----|
  769. * Existing: |----| |----|
  770. *
  771. * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
  772. * VMAs do not have identical attributes - NO MERGE POSSIBLE.
  773. *
  774. * In instances where we can merge, this function returns the expanded VMA which
  775. * will have its range adjusted accordingly and the underlying maple tree also
  776. * adjusted.
  777. *
  778. * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
  779. * to the VMA we expanded.
  780. *
  781. * This function adjusts @vmg to provide @vmg->next if not already specified,
  782. * and adjusts [@vmg->start, @vmg->end) to span the expanded range.
  783. *
  784. * ASSUMPTIONS:
  785. * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
  786. * - The caller must have determined that [@vmg->start, @vmg->end) is empty,
  787. *   other than VMAs that will be unmapped should the operation succeed.
  788. * - The caller must have specified the previous vma in @vmg->prev.
  789. * - The caller must have specified the next vma in @vmg->next.
  790. * - The caller must have positioned the vmi at or before the gap.
  791. */
  792. struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
  793. {
  794. struct vm_area_struct *prev = vmg->prev;
  795. struct vm_area_struct *next = vmg->next;
  796. unsigned long start = vmg->start;
  797. unsigned long end = vmg->end;
  798. pgoff_t pgoff = vmg->pgoff;
  799. pgoff_t pglen = PHYS_PFN(end - start);
  800. bool can_merge_left, can_merge_right;
  801. bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND;
  802. mmap_assert_write_locked(vmg->mm);
  803. VM_WARN_ON(vmg->vma);
  804. /* vmi must point at or before the gap. */
  805. VM_WARN_ON(vma_iter_addr(vmg->vmi) > end);
  806. vmg->state = VMA_MERGE_NOMERGE;
  807. /* Special VMAs are unmergeable, also if no prev/next. */
  808. if ((vmg->flags & VM_SPECIAL) || (!prev && !next))
  809. return NULL;
  810. can_merge_left = can_vma_merge_left(vmg);
  811. can_merge_right = !just_expand && can_vma_merge_right(vmg, can_merge_left);
  812. /* If we can merge with the next VMA, adjust vmg accordingly. */
  813. if (can_merge_right) {
  814. vmg->end = next->vm_end;
  815. vmg->vma = next;
  816. vmg->pgoff = next->vm_pgoff - pglen;
  817. }
  818. /* If we can merge with the previous VMA, adjust vmg accordingly. */
  819. if (can_merge_left) {
  820. vmg->start = prev->vm_start;
  821. vmg->vma = prev;
  822. vmg->pgoff = prev->vm_pgoff;
  823. /*
  824. * If this merge would result in removal of the next VMA but we
  825. * are not permitted to do so, reduce the operation to merging
  826. * prev and vma.
  827. */
  828. if (can_merge_right && !can_merge_remove_vma(next))
  829. vmg->end = end;
  830. /* In expand-only case we are already positioned at prev. */
  831. if (!just_expand) {
  832. /* Equivalent to going to the previous range. */
  833. vma_prev(vmg->vmi);
  834. }
  835. }
  836. /*
  837. * Now try to expand adjacent VMA(s). This takes care of removing the
  838. * following VMA if we have VMAs on both sides.
  839. */
  840. if (vmg->vma && !vma_expand(vmg)) {
  841. khugepaged_enter_vma(vmg->vma, vmg->flags);
  842. vmg->state = VMA_MERGE_SUCCESS;
  843. return vmg->vma;
  844. }
  845. /* If expansion failed, reset state. Allows us to retry merge later. */
  846. if (!just_expand) {
  847. vmg->vma = NULL;
  848. vmg->start = start;
  849. vmg->end = end;
  850. vmg->pgoff = pgoff;
  851. if (vmg->vma == prev)
  852. vma_iter_set(vmg->vmi, start);
  853. }
  854. return NULL;
  855. }
  856. /*
  857. * vma_expand - Expand an existing VMA
  858. *
  859. * @vmg: Describes a VMA expansion operation.
  860. *
  861. * Expand @vma to vmg->start and vmg->end. Can expand off the start and end.
  862. * Will expand over vmg->next if it's different from vmg->vma and vmg->end ==
  863. * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with
  864. * vmg->next needs to be handled by the caller.
  865. *
  866. * Returns: 0 on success.
  867. *
  868. * ASSUMPTIONS:
  869. * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock.
  870. * - The caller must have set @vmg->vma and @vmg->next.
  871. */
  872. int vma_expand(struct vma_merge_struct *vmg)
  873. {
  874. struct vm_area_struct *anon_dup = NULL;
  875. bool remove_next = false;
  876. struct vm_area_struct *vma = vmg->vma;
  877. struct vm_area_struct *next = vmg->next;
  878. mmap_assert_write_locked(vmg->mm);
  879. vma_start_write(vma);
  880. if (next && (vma != next) && (vmg->end == next->vm_end)) {
  881. int ret;
  882. remove_next = true;
  883. /* This should already have been checked by this point. */
  884. VM_WARN_ON(!can_merge_remove_vma(next));
  885. vma_start_write(next);
  886. ret = dup_anon_vma(vma, next, &anon_dup);
  887. if (ret)
  888. return ret;
  889. }
  890. /* Not merging but overwriting any part of next is not handled. */
  891. VM_WARN_ON(next && !remove_next &&
  892. next != vma && vmg->end > next->vm_start);
  893. /* Only handles expanding */
  894. VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end);
  895. if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true))
  896. goto nomem;
  897. return 0;
  898. nomem:
  899. vmg->state = VMA_MERGE_ERROR_NOMEM;
  900. if (anon_dup)
  901. unlink_anon_vmas(anon_dup);
  902. return -ENOMEM;
  903. }
  904. /*
  905. * vma_shrink() - Reduce an existing VMA's memory area
  906. * @vmi: The vma iterator
  907. * @vma: The VMA to modify
  908. * @start: The new start
  909. * @end: The new end
 * @pgoff: The new page offset
  910. *
  911. * Returns: 0 on success, -ENOMEM otherwise
  912. */
  913. int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
  914. unsigned long start, unsigned long end, pgoff_t pgoff)
  915. {
  916. struct vma_prepare vp;
  917. WARN_ON((vma->vm_start != start) && (vma->vm_end != end));
  918. if (vma->vm_start < start)
  919. vma_iter_config(vmi, vma->vm_start, start);
  920. else
  921. vma_iter_config(vmi, end, vma->vm_end);
  922. if (vma_iter_prealloc(vmi, NULL))
  923. return -ENOMEM;
  924. vma_start_write(vma);
  925. init_vma_prep(&vp, vma);
  926. vma_prepare(&vp);
  927. vma_adjust_trans_huge(vma, start, end, 0);
  928. vma_iter_clear(vmi);
  929. vma_set_range(vma, start, end, pgoff);
  930. vma_complete(&vp, vmi, vma->vm_mm);
  931. validate_mm(vma->vm_mm);
  932. return 0;
  933. }
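/*
 * vms_clear_ptes() - Unmap the page ranges of the detached VMAs tracked in
 * @mas_detach and free the associated page tables, provided vms->clear_ptes
 * is set; the flag is cleared once the work is done.
 */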
  934. static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
  935. struct ma_state *mas_detach, bool mm_wr_locked)
  936. {
  937. struct mmu_gather tlb;
  938. if (!vms->clear_ptes) /* Nothing to do */
  939. return;
  940. /*
  941. * We can free page tables without write-locking mmap_lock because VMAs
  942. * were isolated before we downgraded mmap_lock.
  943. */
  944. mas_set(mas_detach, 1);
  945. lru_add_drain();
  946. tlb_gather_mmu(&tlb, vms->vma->vm_mm);
  947. update_hiwater_rss(vms->vma->vm_mm);
  948. unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
  949. vms->vma_count, mm_wr_locked);
  950. mas_set(mas_detach, 1);
  951. /* start and end may be different if there is no prev or next vma. */
  952. free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
  953. vms->unmap_end, mm_wr_locked);
  954. tlb_finish_mmu(&tlb);
  955. vms->clear_ptes = false;
  956. }
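/*
 * vms_clean_up_area() - Clear the PTEs of the detached VMAs (with the mmap
 * lock still write-locked) and invoke each VMA's close hook via vma_close().
 */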
  957. void vms_clean_up_area(struct vma_munmap_struct *vms,
  958. struct ma_state *mas_detach)
  959. {
  960. struct vm_area_struct *vma;
  961. if (!vms->nr_pages)
  962. return;
  963. vms_clear_ptes(vms, mas_detach, true);
  964. mas_set(mas_detach, 0);
  965. mas_for_each(mas_detach, vma, ULONG_MAX)
  966. vma_close(vma);
  967. }
  968. /*
  969. * vms_complete_munmap_vmas() - Finish the munmap() operation
  970. * @vms: The vma munmap struct
  971. * @mas_detach: The maple state of the detached vmas
  972. *
  973. * This updates the mm_struct, unmaps the region, frees the resources
  974. * used for the munmap() and may downgrade the lock - if requested. Everything
  975. * needed to be done once the vma maple tree is updated.
  976. */
  977. void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
  978. struct ma_state *mas_detach)
  979. {
  980. struct vm_area_struct *vma;
  981. struct mm_struct *mm;
  982. mm = current->mm;
  983. mm->map_count -= vms->vma_count;
  984. mm->locked_vm -= vms->locked_vm;
  985. if (vms->unlock)
  986. mmap_write_downgrade(mm);
  987. if (!vms->nr_pages)
  988. return;
  989. vms_clear_ptes(vms, mas_detach, !vms->unlock);
  990. /* Update high watermark before we lower total_vm */
  991. update_hiwater_vm(mm);
  992. /* Stat accounting */
  993. WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm) - vms->nr_pages);
  994. /* Paranoid bookkeeping */
  995. VM_WARN_ON(vms->exec_vm > mm->exec_vm);
  996. VM_WARN_ON(vms->stack_vm > mm->stack_vm);
  997. VM_WARN_ON(vms->data_vm > mm->data_vm);
  998. mm->exec_vm -= vms->exec_vm;
  999. mm->stack_vm -= vms->stack_vm;
  1000. mm->data_vm -= vms->data_vm;
  1001. /* Remove and clean up vmas */
  1002. mas_set(mas_detach, 0);
  1003. mas_for_each(mas_detach, vma, ULONG_MAX)
  1004. remove_vma(vma, /* unreachable = */ false);
  1005. vm_unacct_memory(vms->nr_accounted);
  1006. validate_mm(mm);
  1007. if (vms->unlock)
  1008. mmap_read_unlock(mm);
  1009. __mt_destroy(mas_detach->tree);
  1010. }
  1011. /*
  1012. * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree
  1013. * for removal at a later date. Handles splitting first and last if necessary
  1014. * and marking the vmas as isolated.
  1015. *
  1016. * @vms: The vma munmap struct
  1017. * @mas_detach: The maple state tracking the detached tree
  1018. *
  1019. * Return: 0 on success, error otherwise
  1020. */
  1021. int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
  1022. struct ma_state *mas_detach)
  1023. {
  1024. struct vm_area_struct *next = NULL;
  1025. int error;
  1026. /*
  1027. * If we need to split any vma, do it now to save pain later.
  1028. * Does it split the first one?
  1029. */
  1030. if (vms->start > vms->vma->vm_start) {
  1031. /*
  1032. * Make sure that map_count on return from munmap() will
  1033. * not exceed its limit; but let map_count go just above
  1034. * its limit temporarily, to help free resources as expected.
  1035. */
  1036. if (vms->end < vms->vma->vm_end &&
  1037. vms->vma->vm_mm->map_count >= sysctl_max_map_count) {
  1038. error = -ENOMEM;
  1039. goto map_count_exceeded;
  1040. }
  1041. /* Don't bother splitting the VMA if we can't unmap it anyway */
  1042. if (!can_modify_vma(vms->vma)) {
  1043. error = -EPERM;
  1044. goto start_split_failed;
  1045. }
  1046. error = __split_vma(vms->vmi, vms->vma, vms->start, 1);
  1047. if (error)
  1048. goto start_split_failed;
  1049. }
  1050. vms->prev = vma_prev(vms->vmi);
  1051. if (vms->prev)
  1052. vms->unmap_start = vms->prev->vm_end;
  1053. /*
  1054. * Detach a range of VMAs from the mm. Using next as a temp variable as
  1055. * it is always overwritten.
  1056. */
  1057. for_each_vma_range(*(vms->vmi), next, vms->end) {
  1058. long nrpages;
  1059. if (!can_modify_vma(next)) {
  1060. error = -EPERM;
  1061. goto modify_vma_failed;
  1062. }
  1063. /* Does it split the end? */
  1064. if (next->vm_end > vms->end) {
  1065. error = __split_vma(vms->vmi, next, vms->end, 0);
  1066. if (error)
  1067. goto end_split_failed;
  1068. }
  1069. vma_start_write(next);
  1070. mas_set(mas_detach, vms->vma_count++);
  1071. error = mas_store_gfp(mas_detach, next, GFP_KERNEL);
  1072. if (error)
  1073. goto munmap_gather_failed;
  1074. vma_mark_detached(next, true);
  1075. nrpages = vma_pages(next);
  1076. vms->nr_pages += nrpages;
  1077. if (next->vm_flags & VM_LOCKED)
  1078. vms->locked_vm += nrpages;
  1079. if (next->vm_flags & VM_ACCOUNT)
  1080. vms->nr_accounted += nrpages;
  1081. if (is_exec_mapping(next->vm_flags))
  1082. vms->exec_vm += nrpages;
  1083. else if (is_stack_mapping(next->vm_flags))
  1084. vms->stack_vm += nrpages;
  1085. else if (is_data_mapping(next->vm_flags))
  1086. vms->data_vm += nrpages;
  1087. if (unlikely(vms->uf)) {
  1088. /*
  1089. * If userfaultfd_unmap_prep returns an error the vmas
  1090. * will remain split, but userland will get a
  1091. * highly unexpected error anyway. This is no
  1092. * different than the case where the first of the two
  1093. * __split_vma fails, but we don't undo the first
  1094. * split, even though we could. This failure is unlikely
  1095. * enough that it's not worth optimizing for.
  1096. */
  1097. error = userfaultfd_unmap_prep(next, vms->start,
  1098. vms->end, vms->uf);
  1099. if (error)
  1100. goto userfaultfd_error;
  1101. }
  1102. #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
  1103. BUG_ON(next->vm_start < vms->start);
  1104. BUG_ON(next->vm_start > vms->end);
  1105. #endif
  1106. }
  1107. vms->next = vma_next(vms->vmi);
  1108. if (vms->next)
  1109. vms->unmap_end = vms->next->vm_start;
  1110. #if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
  1111. /* Make sure no VMAs are about to be lost. */
  1112. {
  1113. MA_STATE(test, mas_detach->tree, 0, 0);
  1114. struct vm_area_struct *vma_mas, *vma_test;
  1115. int test_count = 0;
  1116. vma_iter_set(vms->vmi, vms->start);
  1117. rcu_read_lock();
  1118. vma_test = mas_find(&test, vms->vma_count - 1);
  1119. for_each_vma_range(*(vms->vmi), vma_mas, vms->end) {
  1120. BUG_ON(vma_mas != vma_test);
  1121. test_count++;
  1122. vma_test = mas_next(&test, vms->vma_count - 1);
  1123. }
  1124. rcu_read_unlock();
  1125. BUG_ON(vms->vma_count != test_count);
  1126. }
  1127. #endif
  1128. while (vma_iter_addr(vms->vmi) > vms->start)
  1129. vma_iter_prev_range(vms->vmi);
  1130. vms->clear_ptes = true;
  1131. return 0;
  1132. userfaultfd_error:
  1133. munmap_gather_failed:
  1134. end_split_failed:
  1135. modify_vma_failed:
  1136. reattach_vmas(mas_detach);
  1137. start_split_failed:
  1138. map_count_exceeded:
  1139. return error;
  1140. }
  1141. /*
  1142. * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
  1143. * @vmi: The vma iterator
  1144. * @vma: The starting vm_area_struct
  1145. * @mm: The mm_struct
  1146. * @start: The aligned start address to munmap.
  1147. * @end: The aligned end address to munmap.
  1148. * @uf: The userfaultfd list_head
  1149. * @unlock: Set to true to drop the mmap_lock. Unlocking only happens on
  1150. * success.
  1151. *
  1152. * Return: 0 on success and drops the lock if so directed, error and leaves the
  1153. * lock held otherwise.
  1154. */
  1155. int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
  1156. struct mm_struct *mm, unsigned long start, unsigned long end,
  1157. struct list_head *uf, bool unlock)
  1158. {
  1159. struct maple_tree mt_detach;
  1160. MA_STATE(mas_detach, &mt_detach, 0, 0);
  1161. mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
  1162. mt_on_stack(mt_detach);
  1163. struct vma_munmap_struct vms;
  1164. int error;
  1165. init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock);
  1166. error = vms_gather_munmap_vmas(&vms, &mas_detach);
  1167. if (error)
  1168. goto gather_failed;
  1169. error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
  1170. if (error)
  1171. goto clear_tree_failed;
  1172. /* Point of no return */
  1173. vms_complete_munmap_vmas(&vms, &mas_detach);
  1174. return 0;
  1175. clear_tree_failed:
  1176. reattach_vmas(&mas_detach);
  1177. gather_failed:
  1178. validate_mm(mm);
  1179. return error;
  1180. }
  1181. /*
  1182. * do_vmi_munmap() - munmap a given range.
  1183. * @vmi: The vma iterator
  1184. * @mm: The mm_struct
  1185. * @start: The start address to munmap
  1186. * @len: The length of the range to munmap
  1187. * @uf: The userfaultfd list_head
  1188. * @unlock: set to true if the user wants to drop the mmap_lock on success
  1189. *
  1190. * This function takes a @vmi that is either pointing to the previous VMA or set
  1191. * to MA_START and sets it up to remove the mapping(s). The @len will be
  1192. * page-aligned.
  1193. *
  1194. * Return: 0 on success and drops the lock if so directed, error and leaves the
  1195. * lock held otherwise.
  1196. */
  1197. int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
  1198. unsigned long start, size_t len, struct list_head *uf,
  1199. bool unlock)
  1200. {
  1201. unsigned long end;
  1202. struct vm_area_struct *vma;
  1203. if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
  1204. return -EINVAL;
  1205. end = start + PAGE_ALIGN(len);
  1206. if (end == start)
  1207. return -EINVAL;
  1208. /* Find the first overlapping VMA */
  1209. vma = vma_find(vmi, end);
  1210. if (!vma) {
  1211. if (unlock)
  1212. mmap_write_unlock(mm);
  1213. return 0;
  1214. }
  1215. return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
  1216. }
  1217. /*
  1218. * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd
  1219. * context and anonymous VMA name within the range [start, end).
  1220. *
  1221. * As a result, we might be able to merge the newly modified VMA range with an
  1222. * adjacent VMA with identical properties.
  1223. *
  1224. * If no merge is possible and the range does not span the entirety of the VMA,
  1225. * we then need to split the VMA to accommodate the change.
  1226. *
  1227. * The function returns either the merged VMA, the original VMA if a split was
  1228. * required instead, or an error if the split failed.
  1229. */
  1230. static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
  1231. {
  1232. struct vm_area_struct *vma = vmg->vma;
  1233. struct vm_area_struct *merged;
  1234. /* First, try to merge. */
  1235. merged = vma_merge_existing_range(vmg);
  1236. if (merged)
  1237. return merged;
  1238. /* Split any preceding portion of the VMA. */
  1239. if (vma->vm_start < vmg->start) {
  1240. int err = split_vma(vmg->vmi, vma, vmg->start, 1);
  1241. if (err)
  1242. return ERR_PTR(err);
  1243. }
  1244. /* Split any trailing portion of the VMA. */
  1245. if (vma->vm_end > vmg->end) {
  1246. int err = split_vma(vmg->vmi, vma, vmg->end, 0);
  1247. if (err)
  1248. return ERR_PTR(err);
  1249. }
  1250. return vma;
  1251. }
  1252. struct vm_area_struct *vma_modify_flags(
  1253. struct vma_iterator *vmi, struct vm_area_struct *prev,
  1254. struct vm_area_struct *vma, unsigned long start, unsigned long end,
  1255. unsigned long new_flags)
  1256. {
  1257. VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
  1258. vmg.flags = new_flags;
  1259. return vma_modify(&vmg);
  1260. }
  1261. struct vm_area_struct
  1262. *vma_modify_flags_name(struct vma_iterator *vmi,
  1263. struct vm_area_struct *prev,
  1264. struct vm_area_struct *vma,
  1265. unsigned long start,
  1266. unsigned long end,
  1267. unsigned long new_flags,
  1268. struct anon_vma_name *new_name)
  1269. {
  1270. VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
  1271. vmg.flags = new_flags;
  1272. vmg.anon_name = new_name;
  1273. return vma_modify(&vmg);
  1274. }
  1275. struct vm_area_struct
  1276. *vma_modify_policy(struct vma_iterator *vmi,
  1277. struct vm_area_struct *prev,
  1278. struct vm_area_struct *vma,
  1279. unsigned long start, unsigned long end,
  1280. struct mempolicy *new_pol)
  1281. {
  1282. VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
  1283. vmg.policy = new_pol;
  1284. return vma_modify(&vmg);
  1285. }
  1286. struct vm_area_struct
  1287. *vma_modify_flags_uffd(struct vma_iterator *vmi,
  1288. struct vm_area_struct *prev,
  1289. struct vm_area_struct *vma,
  1290. unsigned long start, unsigned long end,
  1291. unsigned long new_flags,
  1292. struct vm_userfaultfd_ctx new_ctx)
  1293. {
  1294. VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
  1295. vmg.flags = new_flags;
  1296. vmg.uffd_ctx = new_ctx;
  1297. return vma_modify(&vmg);
  1298. }
  1299. /*
  1300. * Expand vma by delta bytes, potentially merging with an immediately adjacent
  1301. * VMA with identical properties.
  1302. */
  1303. struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
  1304. struct vm_area_struct *vma,
  1305. unsigned long delta)
  1306. {
  1307. VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta);
  1308. vmg.next = vma_iter_next_rewind(vmi, NULL);
  1309. vmg.vma = NULL; /* We use the VMA to populate VMG fields only. */
  1310. return vma_merge_new_range(&vmg);
  1311. }
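/*
 * The unlink_file_vma_batch_*() helpers below remove file-backed VMAs from
 * their file's i_mmap interval tree in batches, taking the mapping's
 * i_mmap_rwsem once per batch of VMAs that share the same vm_file.
 */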
  1312. void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
  1313. {
  1314. vb->count = 0;
  1315. }
  1316. static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb)
  1317. {
  1318. struct address_space *mapping;
  1319. int i;
  1320. mapping = vb->vmas[0]->vm_file->f_mapping;
  1321. i_mmap_lock_write(mapping);
  1322. for (i = 0; i < vb->count; i++) {
  1323. VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping);
  1324. __remove_shared_vm_struct(vb->vmas[i], mapping);
  1325. }
  1326. i_mmap_unlock_write(mapping);
  1327. unlink_file_vma_batch_init(vb);
  1328. }
  1329. void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
  1330. struct vm_area_struct *vma)
  1331. {
  1332. if (vma->vm_file == NULL)
  1333. return;
  1334. if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) ||
  1335. vb->count == ARRAY_SIZE(vb->vmas))
  1336. unlink_file_vma_batch_process(vb);
  1337. vb->vmas[vb->count] = vma;
  1338. vb->count++;
  1339. }
  1340. void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
  1341. {
  1342. if (vb->count > 0)
  1343. unlink_file_vma_batch_process(vb);
  1344. }
/*
 * Unlink a file-based vm structure from its interval tree, to hide
 * vma from rmap and vmtruncate before freeing its page tables.
 */
void unlink_file_vma(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;

        if (file) {
                struct address_space *mapping = file->f_mapping;

                i_mmap_lock_write(mapping);
                __remove_shared_vm_struct(vma, mapping);
                i_mmap_unlock_write(mapping);
        }
}

void vma_link_file(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;
        struct address_space *mapping;

        if (file) {
                mapping = file->f_mapping;
                i_mmap_lock_write(mapping);
                __vma_link_file(vma, mapping);
                i_mmap_unlock_write(mapping);
        }
}

int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
{
        VMA_ITERATOR(vmi, mm, 0);

        vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
        if (vma_iter_prealloc(&vmi, vma))
                return -ENOMEM;

        vma_start_write(vma);
        vma_iter_store(&vmi, vma);
        vma_link_file(vma);
        mm->map_count++;
        validate_mm(mm);
        return 0;
}

/*
 * Copy the vma structure to a new location in the same mm,
 * prior to moving page table entries, to effect an mremap move.
 */
struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
        unsigned long addr, unsigned long len, pgoff_t pgoff,
        bool *need_rmap_locks)
{
        struct vm_area_struct *vma = *vmap;
        unsigned long vma_start = vma->vm_start;
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *new_vma;
        bool faulted_in_anon_vma = true;
        VMA_ITERATOR(vmi, mm, addr);
        VMG_VMA_STATE(vmg, &vmi, NULL, vma, addr, addr + len);

        /*
         * If anonymous vma has not yet been faulted, update new pgoff
         * to match new location, to increase its chance of merging.
         */
        if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
                pgoff = addr >> PAGE_SHIFT;
                faulted_in_anon_vma = false;
        }

        new_vma = find_vma_prev(mm, addr, &vmg.prev);
        if (new_vma && new_vma->vm_start < addr + len)
                return NULL;    /* should never get here */

        vmg.vma = NULL; /* New VMA range. */
        vmg.pgoff = pgoff;
        vmg.next = vma_iter_next_rewind(&vmi, NULL);
        new_vma = vma_merge_new_range(&vmg);

        if (new_vma) {
                /*
                 * Source vma may have been merged into new_vma
                 */
                if (unlikely(vma_start >= new_vma->vm_start &&
                             vma_start < new_vma->vm_end)) {
                        /*
                         * The only way we can get a vma_merge with
                         * self during an mremap is if the vma hasn't
                         * been faulted in yet and we were allowed to
                         * reset the dst vma->vm_pgoff to the
                         * destination address of the mremap to allow
                         * the merge to happen. mremap must change the
                         * vm_pgoff linearity between src and dst vmas
                         * (in turn preventing a vma_merge) to be
                         * safe. It is only safe to keep the vm_pgoff
                         * linear if there are no pages mapped yet.
                         */
                        VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
                        *vmap = vma = new_vma;
                }
                *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
        } else {
                new_vma = vm_area_dup(vma);
                if (!new_vma)
                        goto out;
                vma_set_range(new_vma, addr, addr + len, pgoff);
                if (vma_dup_policy(vma, new_vma))
                        goto out_free_vma;
                if (anon_vma_clone(new_vma, vma))
                        goto out_free_mempol;
                if (new_vma->vm_file)
                        get_file(new_vma->vm_file);
                if (new_vma->vm_ops && new_vma->vm_ops->open)
                        new_vma->vm_ops->open(new_vma);
                if (vma_link(mm, new_vma))
                        goto out_vma_link;
                *need_rmap_locks = false;
        }
        return new_vma;

out_vma_link:
        vma_close(new_vma);

        if (new_vma->vm_file)
                fput(new_vma->vm_file);

        unlink_anon_vmas(new_vma);
out_free_mempol:
        mpol_put(vma_policy(new_vma));
out_free_vma:
        vm_area_free(new_vma);
out:
        return NULL;
}
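
/*
 * Example (illustrative sketch, not part of this file): the mremap move
 * path pairs copy_vma() with a page-table move. The need_rmap_locks flag
 * it fills in tells the mover whether rmap locks must be held while PTEs
 * are in flight:
 *
 *      bool need_rmap_locks;
 *      struct vm_area_struct *new_vma;
 *
 *      new_vma = copy_vma(&vma, new_addr, len, new_pgoff, &need_rmap_locks);
 *      if (!new_vma)
 *              return -ENOMEM;
 *      ... then move the page tables from vma into new_vma (see
 *          move_page_tables()), honouring need_rmap_locks ...
 *
 * The locals and surrounding error handling are assumptions made for
 * illustration; the real caller lives in mm/mremap.c.
 */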
/*
 * Rough compatibility check to quickly see if it's even worth looking
 * at sharing an anon_vma.
 *
 * They need to have the same vm_file, and the flags can only differ
 * in things that mprotect may change.
 *
 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
 * we can merge the two vma's. For example, we refuse to merge a vma if
 * there is a vm_ops->close() function, because that indicates that the
 * driver is doing some kind of reference counting. But that doesn't
 * really matter for the anon_vma sharing case.
 */
static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
{
        return a->vm_end == b->vm_start &&
                mpol_equal(vma_policy(a), vma_policy(b)) &&
                a->vm_file == b->vm_file &&
                !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) &&
                b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
}

/*
 * Do some basic sanity checking to see if we can re-use the anon_vma
 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
 * the same as 'old', the other will be the new one that is trying
 * to share the anon_vma.
 *
 * NOTE! This runs with mmap_lock held for reading, so it is possible that
 * the anon_vma of 'old' is concurrently in the process of being set up
 * by another page fault trying to merge _that_. But that's ok: if it
 * is being set up, that automatically means that it will be a singleton
 * acceptable for merging, so we can do all of this optimistically. But
 * we do that READ_ONCE() to make sure that we never re-load the pointer.
 *
 * IOW: that the "list_is_singular()" test on the anon_vma_chain only
 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
 * is to return an anon_vma that is "complex" due to having gone through
 * a fork).
 *
 * We also make sure that the two vma's are compatible (adjacent,
 * and with the same memory policies). That's all stable, even with just
 * a read lock on the mmap_lock.
 */
static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old,
                                          struct vm_area_struct *a,
                                          struct vm_area_struct *b)
{
        if (anon_vma_compatible(a, b)) {
                struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);

                if (anon_vma && list_is_singular(&old->anon_vma_chain))
                        return anon_vma;
        }
        return NULL;
}

/*
 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
 * neighbouring vmas for a suitable anon_vma, before it goes off
 * to allocate a new anon_vma. It checks because a repetitive
 * sequence of mprotects and faults may otherwise lead to distinct
 * anon_vmas being allocated, preventing vma merge in subsequent
 * mprotect.
 */
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
        struct anon_vma *anon_vma = NULL;
        struct vm_area_struct *prev, *next;
        VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end);

        /* Try next first. */
        next = vma_iter_load(&vmi);
        if (next) {
                anon_vma = reusable_anon_vma(next, vma, next);
                if (anon_vma)
                        return anon_vma;
        }

        prev = vma_prev(&vmi);
        VM_BUG_ON_VMA(prev != vma, vma);
        prev = vma_prev(&vmi);
        /* Try prev next. */
        if (prev)
                anon_vma = reusable_anon_vma(prev, prev, vma);

        /*
         * We might reach here with anon_vma == NULL if we can't find
         * any reusable anon_vma.
         * There's no absolute need to look only at touching neighbours:
         * we could search further afield for "compatible" anon_vmas.
         * But it would probably just be a waste of time searching,
         * or lead to too many vmas hanging off the same anon_vma.
         * We're trying to allow mprotect remerging later on,
         * not trying to minimize memory used for anon_vmas.
         */
        return anon_vma;
}
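
/*
 * Example (illustrative sketch, not part of this file): the first anonymous
 * fault on a VMA prepares an anon_vma roughly like this, preferring a
 * neighbour's reusable anon_vma over allocating a fresh one:
 *
 *      anon_vma = find_mergeable_anon_vma(vma);
 *      if (!anon_vma) {
 *              anon_vma = anon_vma_alloc();
 *              if (!anon_vma)
 *                      return -ENOMEM;
 *      }
 *      ... then lock the anon_vma and attach it to the VMA ...
 *
 * This only summarises what __anon_vma_prepare() in mm/rmap.c does; the
 * real control flow there is more involved.
 */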
static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops)
{
        return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite);
}

static bool vma_is_shared_writable(struct vm_area_struct *vma)
{
        return (vma->vm_flags & (VM_WRITE | VM_SHARED)) ==
                (VM_WRITE | VM_SHARED);
}

static bool vma_fs_can_writeback(struct vm_area_struct *vma)
{
        /* No managed pages to writeback. */
        if (vma->vm_flags & VM_PFNMAP)
                return false;

        return vma->vm_file && vma->vm_file->f_mapping &&
                mapping_can_writeback(vma->vm_file->f_mapping);
}

/*
 * Does this VMA require the underlying folios to have their dirty state
 * tracked?
 */
bool vma_needs_dirty_tracking(struct vm_area_struct *vma)
{
        /* Only shared, writable VMAs require dirty tracking. */
        if (!vma_is_shared_writable(vma))
                return false;

        /* Does the filesystem need to be notified? */
        if (vm_ops_needs_writenotify(vma->vm_ops))
                return true;

        /*
         * Even if the filesystem doesn't indicate a need for writenotify, if
         * it can writeback, dirty tracking is still required.
         */
        return vma_fs_can_writeback(vma);
}

/*
 * Some shared mappings will want the pages marked read-only
 * to track write events. If so, we'll downgrade vm_page_prot
 * to the private version (using protection_map[] without the
 * VM_SHARED bit).
 */
bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
{
        /* If it was private or non-writable, the write bit is already clear */
        if (!vma_is_shared_writable(vma))
                return false;

        /* The backer wishes to know when pages are first written to? */
        if (vm_ops_needs_writenotify(vma->vm_ops))
                return true;

        /* The open routine did something to the protections that pgprot_modify
         * won't preserve? */
        if (pgprot_val(vm_page_prot) !=
            pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags)))
                return false;

        /*
         * Do we need to track softdirty? hugetlb does not support softdirty
         * tracking yet.
         */
        if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
                return true;

        /* Do we need write faults for uffd-wp tracking? */
        if (userfaultfd_wp(vma))
                return true;

        /* Can the mapping track the dirty pages? */
        return vma_fs_can_writeback(vma);
}
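
/*
 * Example (illustrative sketch, not part of this file): callers use the
 * answer to decide whether to install a read-only vm_page_prot so that the
 * first write traps. vma_set_page_prot() in mm/mmap.c does roughly:
 *
 *      unsigned long vm_flags = vma->vm_flags;
 *      pgprot_t vm_page_prot;
 *
 *      vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
 *      if (vma_wants_writenotify(vma, vm_page_prot)) {
 *              vm_flags &= ~VM_SHARED;
 *              vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
 *      }
 *      WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
 *
 * This is a simplification for illustration; see the real helper for the
 * exact code and locking notes.
 */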
static DEFINE_MUTEX(mm_all_locks_mutex);

static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
        if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
                /*
                 * The LSB of head.next can't change from under us
                 * because we hold the mm_all_locks_mutex.
                 */
                down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
                /*
                 * We can safely modify head.next after taking the
                 * anon_vma->root->rwsem. If some other vma in this mm shares
                 * the same anon_vma we won't take it again.
                 *
                 * No need of atomic instructions here, head.next
                 * can't change from under us thanks to the
                 * anon_vma->root->rwsem.
                 */
                if (__test_and_set_bit(0, (unsigned long *)
                                       &anon_vma->root->rb_root.rb_root.rb_node))
                        BUG();
        }
}

static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
{
        if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
                /*
                 * AS_MM_ALL_LOCKS can't change from under us because
                 * we hold the mm_all_locks_mutex.
                 *
                 * Operations on ->flags have to be atomic because
                 * even if AS_MM_ALL_LOCKS is stable thanks to the
                 * mm_all_locks_mutex, there may be other cpus
                 * changing other bitflags in parallel to us.
                 */
                if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
                        BUG();
                down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
        }
}

/*
 * This operation locks against the VM for all pte/vma/mm related
 * operations that could ever happen on a certain mm. This includes
 * vmtruncate, try_to_unmap, and all page faults.
 *
 * The caller must take the mmap_lock in write mode before calling
 * mm_take_all_locks(). The caller isn't allowed to release the
 * mmap_lock until mm_drop_all_locks() returns.
 *
 * mmap_lock in write mode is required in order to block all operations
 * that could modify pagetables and free pages without need of
 * altering the vma layout. It's also needed in write mode to prevent new
 * anon_vmas from being associated with existing vmas.
 *
 * A single task can't take more than one mm_take_all_locks() in a row
 * or it would deadlock.
 *
 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
 * mapping->flags avoid taking the same lock twice, if more than one
 * vma in this mm is backed by the same anon_vma or address_space.
 *
 * We take locks in the following order, according to the comment at the
 * beginning of mm/rmap.c:
 * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
 *   hugetlb mappings);
 * - all vmas marked locked
 * - all i_mmap_rwsem locks;
 * - all anon_vma->rwsem locks
 *
 * We can take all locks within these types randomly because the VM code
 * doesn't nest them and we're protected from parallel mm_take_all_locks() by
 * mm_all_locks_mutex.
 *
 * mm_take_all_locks() and mm_drop_all_locks() are expensive operations
 * that may have to take thousands of locks.
 *
 * mm_take_all_locks() can fail if it's interrupted by signals.
 */
int mm_take_all_locks(struct mm_struct *mm)
{
        struct vm_area_struct *vma;
        struct anon_vma_chain *avc;
        VMA_ITERATOR(vmi, mm, 0);

        mmap_assert_write_locked(mm);

        mutex_lock(&mm_all_locks_mutex);

        /*
         * vma_start_write() does not have a complement in mm_drop_all_locks()
         * because vma_start_write() is always asymmetrical; it marks a VMA as
         * being written to until mmap_write_unlock() or mmap_write_downgrade()
         * is reached.
         */
        for_each_vma(vmi, vma) {
                if (signal_pending(current))
                        goto out_unlock;
                vma_start_write(vma);
        }

        vma_iter_init(&vmi, mm, 0);
        for_each_vma(vmi, vma) {
                if (signal_pending(current))
                        goto out_unlock;
                if (vma->vm_file && vma->vm_file->f_mapping &&
                                is_vm_hugetlb_page(vma))
                        vm_lock_mapping(mm, vma->vm_file->f_mapping);
        }

        vma_iter_init(&vmi, mm, 0);
        for_each_vma(vmi, vma) {
                if (signal_pending(current))
                        goto out_unlock;
                if (vma->vm_file && vma->vm_file->f_mapping &&
                                !is_vm_hugetlb_page(vma))
                        vm_lock_mapping(mm, vma->vm_file->f_mapping);
        }

        vma_iter_init(&vmi, mm, 0);
        for_each_vma(vmi, vma) {
                if (signal_pending(current))
                        goto out_unlock;
                if (vma->anon_vma)
                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                                vm_lock_anon_vma(mm, avc->anon_vma);
        }

        return 0;

out_unlock:
        mm_drop_all_locks(mm);
        return -EINTR;
}
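
/*
 * Example (illustrative sketch, not part of this file): a caller such as
 * mmu notifier registration wraps the take/drop pair inside a write-locked
 * mmap_lock, bailing out if a signal interrupted the lock sweep:
 *
 *      mmap_write_lock(mm);
 *      ret = mm_take_all_locks(mm);
 *      if (ret) {
 *              mmap_write_unlock(mm);
 *              return ret;             (-EINTR: a signal arrived)
 *      }
 *      ... publish state that page faults and rmap must not race with ...
 *      mm_drop_all_locks(mm);
 *      mmap_write_unlock(mm);
 *
 * The work done in the middle is an assumption for illustration; only the
 * locking shape is prescribed by the comments above.
 */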
static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
{
        if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
                /*
                 * The LSB of head.next can't change to 0 from under
                 * us because we hold the mm_all_locks_mutex.
                 *
                 * We must however clear the bitflag before unlocking
                 * the vma so the users using the anon_vma->rb_root will
                 * never see our bitflag.
                 *
                 * No need of atomic instructions here, head.next
                 * can't change from under us until we release the
                 * anon_vma->root->rwsem.
                 */
                if (!__test_and_clear_bit(0, (unsigned long *)
                                          &anon_vma->root->rb_root.rb_root.rb_node))
                        BUG();
                anon_vma_unlock_write(anon_vma);
        }
}

static void vm_unlock_mapping(struct address_space *mapping)
{
        if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
                /*
                 * AS_MM_ALL_LOCKS can't change to 0 from under us
                 * because we hold the mm_all_locks_mutex.
                 */
                i_mmap_unlock_write(mapping);
                if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
                                        &mapping->flags))
                        BUG();
        }
}

/*
 * The mmap_lock cannot be released by the caller until
 * mm_drop_all_locks() returns.
 */
void mm_drop_all_locks(struct mm_struct *mm)
{
        struct vm_area_struct *vma;
        struct anon_vma_chain *avc;
        VMA_ITERATOR(vmi, mm, 0);

        mmap_assert_write_locked(mm);
        BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));

        for_each_vma(vmi, vma) {
                if (vma->anon_vma)
                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                                vm_unlock_anon_vma(avc->anon_vma);
                if (vma->vm_file && vma->vm_file->f_mapping)
                        vm_unlock_mapping(vma->vm_file->f_mapping);
        }

        mutex_unlock(&mm_all_locks_mutex);
}