/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * vma.h
 *
 * Core VMA manipulation API implemented in vma.c.
 */
#ifndef __MM_VMA_H
#define __MM_VMA_H

/*
 * VMA lock generalization
 */
struct vma_prepare {
        struct vm_area_struct *vma;
        struct vm_area_struct *adj_next;
        struct file *file;
        struct address_space *mapping;
        struct anon_vma *anon_vma;
        struct vm_area_struct *insert;
        struct vm_area_struct *remove;
        struct vm_area_struct *remove2;
};

struct unlink_vma_file_batch {
        int count;
        struct vm_area_struct *vmas[8];
};

/*
 * vma munmap operation
 */
struct vma_munmap_struct {
        struct vma_iterator *vmi;
        struct vm_area_struct *vma;     /* The first vma to munmap */
        struct vm_area_struct *prev;    /* vma before the munmap area */
        struct vm_area_struct *next;    /* vma after the munmap area */
        struct list_head *uf;           /* Userfaultfd list_head */
        unsigned long start;            /* Aligned start addr (inclusive) */
        unsigned long end;              /* Aligned end addr (exclusive) */
        unsigned long unmap_start;      /* Unmap PTE start */
        unsigned long unmap_end;        /* Unmap PTE end */
        int vma_count;                  /* Number of vmas that will be removed */
        bool unlock;                    /* Unlock after the munmap */
        bool clear_ptes;                /* If there are outstanding PTEs to be cleared */
        /* 2 byte hole */
        unsigned long nr_pages;         /* Number of pages being removed */
        unsigned long locked_vm;        /* Number of locked pages */
        unsigned long nr_accounted;     /* Number of VM_ACCOUNT pages */
        unsigned long exec_vm;
        unsigned long stack_vm;
        unsigned long data_vm;
};
enum vma_merge_state {
        VMA_MERGE_START,
        VMA_MERGE_ERROR_NOMEM,
        VMA_MERGE_NOMERGE,
        VMA_MERGE_SUCCESS,
};

enum vma_merge_flags {
        VMG_FLAG_DEFAULT = 0,
        /*
         * If we can expand, simply do so. We know there is nothing to merge to
         * the right. Does not reset state upon failure to merge. The VMA
         * iterator is assumed to be positioned at the previous VMA, rather than
         * at the gap.
         */
        VMG_FLAG_JUST_EXPAND = 1 << 0,
};

/* Represents a VMA merge operation. */
struct vma_merge_struct {
        struct mm_struct *mm;
        struct vma_iterator *vmi;
        pgoff_t pgoff;
        struct vm_area_struct *prev;
        struct vm_area_struct *next;    /* Modified by vma_merge(). */
        struct vm_area_struct *vma;     /* Either a new VMA or the one being modified. */
        unsigned long start;
        unsigned long end;
        unsigned long flags;
        struct file *file;
        struct anon_vma *anon_vma;
        struct mempolicy *policy;
        struct vm_userfaultfd_ctx uffd_ctx;
        struct anon_vma_name *anon_name;
        enum vma_merge_flags merge_flags;
        enum vma_merge_state state;
};

static inline bool vmg_nomem(struct vma_merge_struct *vmg)
{
        return vmg->state == VMA_MERGE_ERROR_NOMEM;
}

/* Assumes addr >= vma->vm_start. */
static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
                                       unsigned long addr)
{
        return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start);
}
#define VMG_STATE(name, mm_, vmi_, start_, end_, flags_, pgoff_)       \
        struct vma_merge_struct name = {                                \
                .mm = mm_,                                              \
                .vmi = vmi_,                                            \
                .start = start_,                                        \
                .end = end_,                                            \
                .flags = flags_,                                        \
                .pgoff = pgoff_,                                        \
                .state = VMA_MERGE_START,                               \
                .merge_flags = VMG_FLAG_DEFAULT,                        \
        }

#define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_)    \
        struct vma_merge_struct name = {                        \
                .mm = vma_->vm_mm,                              \
                .vmi = vmi_,                                    \
                .prev = prev_,                                  \
                .next = NULL,                                   \
                .vma = vma_,                                    \
                .start = start_,                                \
                .end = end_,                                    \
                .flags = vma_->vm_flags,                        \
                .pgoff = vma_pgoff_offset(vma_, start_),        \
                .file = vma_->vm_file,                          \
                .anon_vma = vma_->anon_vma,                     \
                .policy = vma_policy(vma_),                     \
                .uffd_ctx = vma_->vm_userfaultfd_ctx,           \
                .anon_name = anon_vma_name(vma_),               \
                .state = VMA_MERGE_START,                       \
                .merge_flags = VMG_FLAG_DEFAULT,                \
        }
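
/*
 * Illustrative sketch (not part of the original header): a caller trying to
 * merge a brand-new anonymous range might set up the descriptor roughly like
 * this. The names mm, addr, len and vm_flags are assumptions of the example,
 * not requirements of the API:
 *
 *      VMA_ITERATOR(vmi, mm, addr);
 *      VMG_STATE(vmg, mm, &vmi, addr, addr + len, vm_flags, addr >> PAGE_SHIFT);
 *
 *      vma = vma_merge_new_range(&vmg);
 *      if (!vma && vmg_nomem(&vmg))
 *              return -ENOMEM;
 *
 * VMG_VMA_STATE() is the analogous initializer when an existing VMA is being
 * modified, seeding the descriptor from that VMA's attributes.
 */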
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
void validate_mm(struct mm_struct *mm);
#else
#define validate_mm(mm) do { } while (0)
#endif

/* Required for expand_downwards(). */
void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma);

/* Required for expand_downwards(). */
void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma);

int vma_expand(struct vma_merge_struct *vmg);
int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
               unsigned long start, unsigned long end, pgoff_t pgoff);

static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
                                     struct vm_area_struct *vma, gfp_t gfp)
{
        if (vmi->mas.status != ma_start &&
            ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
                vma_iter_invalidate(vmi);

        __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
        mas_store_gfp(&vmi->mas, vma, gfp);
        if (unlikely(mas_is_err(&vmi->mas)))
                return -ENOMEM;

        return 0;
}
#ifdef CONFIG_MMU
/*
 * init_vma_munmap() - Initializer wrapper for vma_munmap_struct
 * @vms: The vma munmap struct
 * @vmi: The vma iterator
 * @vma: The first vm_area_struct to munmap
 * @start: The aligned start address to munmap
 * @end: The aligned end address to munmap
 * @uf: The userfaultfd list_head
 * @unlock: Unlock after the operation. Only unlocked on success
 */
static inline void init_vma_munmap(struct vma_munmap_struct *vms,
                struct vma_iterator *vmi, struct vm_area_struct *vma,
                unsigned long start, unsigned long end, struct list_head *uf,
                bool unlock)
{
        vms->vmi = vmi;
        vms->vma = vma;
        if (vma) {
                vms->start = start;
                vms->end = end;
        } else {
                vms->start = vms->end = 0;
        }
        vms->unlock = unlock;
        vms->uf = uf;
        vms->vma_count = 0;
        vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0;
        vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
        vms->unmap_start = FIRST_USER_ADDRESS;
        vms->unmap_end = USER_PGTABLES_CEILING;
        vms->clear_ptes = false;
}
#endif

int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
                struct ma_state *mas_detach);

void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
                struct ma_state *mas_detach);

void vms_clean_up_area(struct vma_munmap_struct *vms,
                struct ma_state *mas_detach);

/*
 * reattach_vmas() - Undo any munmap work and free resources
 * @mas_detach: The maple state with the detached maple tree
 *
 * Reattach any detached vmas and free up the maple tree used to track the vmas.
 */
static inline void reattach_vmas(struct ma_state *mas_detach)
{
        struct vm_area_struct *vma;

        mas_set(mas_detach, 0);
        mas_for_each(mas_detach, vma, ULONG_MAX)
                vma_mark_detached(vma, false);

        __mt_destroy(mas_detach->tree);
}

/*
 * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap()
 * operation.
 * @vms: The vma unmap structure
 * @mas_detach: The maple state with the detached maple tree
 *
 * Reattach any detached vmas and free up the maple tree used to track the vmas.
 * If that's not possible because the ptes are cleared (and vm_ops->close() may
 * have been called), then a NULL is written over the vmas and the vmas are
 * removed (munmap() completed).
 */
static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
                struct ma_state *mas_detach)
{
        struct ma_state *mas = &vms->vmi->mas;

        if (!vms->nr_pages)
                return;

        if (vms->clear_ptes)
                return reattach_vmas(mas_detach);

        /*
         * Aborting cannot just call the vm_ops open() because they are often
         * not symmetrical and state data has been lost. Resort to the old
         * failure method of leaving a gap where the MAP_FIXED mapping failed.
         */
        mas_set_range(mas, vms->start, vms->end - 1);
        mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL);
        /* Clean up the insertion of the unfortunate gap */
        vms_complete_munmap_vmas(vms, mas_detach);
}
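
/*
 * Illustrative sketch (not part of the original header) of how the munmap
 * helpers above fit together, loosely following do_vmi_align_munmap(); the
 * setup of the detached maple tree is elided and the names vmi, vma, start,
 * end, uf, unlock and the error label are assumptions of the example:
 *
 *      struct vma_munmap_struct vms;
 *
 *      init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock);
 *      error = vms_gather_munmap_vmas(&vms, &mas_detach);
 *      if (error)
 *              goto gather_failed;
 *      ...
 *      vms_complete_munmap_vmas(&vms, &mas_detach);
 *
 * On failure after gathering, reattach_vmas() (nothing torn down yet) or
 * vms_abort_munmap_vmas() (PTEs may already be cleared) provide the undo
 * paths.
 */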

int
do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
                    struct mm_struct *mm, unsigned long start,
                    unsigned long end, struct list_head *uf, bool unlock);

int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
                  unsigned long start, size_t len, struct list_head *uf,
                  bool unlock);

void remove_vma(struct vm_area_struct *vma, bool unreachable);

void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
                struct vm_area_struct *prev, struct vm_area_struct *next);

/* We are about to modify the VMA's flags. */
struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
                struct vm_area_struct *prev, struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                unsigned long new_flags);

/* We are about to modify the VMA's flags and/or anon_name. */
struct vm_area_struct
*vma_modify_flags_name(struct vma_iterator *vmi,
                       struct vm_area_struct *prev,
                       struct vm_area_struct *vma,
                       unsigned long start,
                       unsigned long end,
                       unsigned long new_flags,
                       struct anon_vma_name *new_name);

/* We are about to modify the VMA's memory policy. */
struct vm_area_struct
*vma_modify_policy(struct vma_iterator *vmi,
                   struct vm_area_struct *prev,
                   struct vm_area_struct *vma,
                   unsigned long start, unsigned long end,
                   struct mempolicy *new_pol);

/* We are about to modify the VMA's flags and/or uffd context. */
struct vm_area_struct
*vma_modify_flags_uffd(struct vma_iterator *vmi,
                       struct vm_area_struct *prev,
                       struct vm_area_struct *vma,
                       unsigned long start, unsigned long end,
                       unsigned long new_flags,
                       struct vm_userfaultfd_ctx new_ctx);

struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg);

struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
                                        struct vm_area_struct *vma,
                                        unsigned long delta);

void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb);
void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb);
void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
                               struct vm_area_struct *vma);
void unlink_file_vma(struct vm_area_struct *vma);

void vma_link_file(struct vm_area_struct *vma);
int vma_link(struct mm_struct *mm, struct vm_area_struct *vma);

struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
        unsigned long addr, unsigned long len, pgoff_t pgoff,
        bool *need_rmap_locks);

struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma);

bool vma_needs_dirty_tracking(struct vm_area_struct *vma);
bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);

int mm_take_all_locks(struct mm_struct *mm);
void mm_drop_all_locks(struct mm_struct *mm);

static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
{
        /*
         * We want to check manually if we can change individual PTEs writable
         * if we can't do that automatically for all PTEs in a mapping. For
         * private mappings, that's always the case when we have write
         * permissions as we properly have to handle COW.
         */
        if (vma->vm_flags & VM_SHARED)
                return vma_wants_writenotify(vma, vma->vm_page_prot);
        return !!(vma->vm_flags & VM_WRITE);
}
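
/*
 * Illustrative example (not part of the original header): protection-changing
 * code can consult the helper above to decide whether individual PTEs may
 * later be upgraded to writable one at a time, along the lines of what
 * mprotect does (the MM_CP_TRY_CHANGE_WRITABLE flag name is an assumption of
 * the example):
 *
 *      if (vma_wants_manual_pte_write_upgrade(vma))
 *              mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
 */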

#ifdef CONFIG_MMU
static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
{
        return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
}
#endif

static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
                                                    unsigned long min)
{
        return mas_prev(&vmi->mas, min);
}

/*
 * These three helpers classify VMAs for virtual memory accounting.
 */

/*
 * Executable code area - executable, not writable, not stack
 */
static inline bool is_exec_mapping(vm_flags_t flags)
{
        return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
}

/*
 * Stack area (including shadow stacks)
 *
 * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
 * do_mmap() forbids all other combinations.
 */
static inline bool is_stack_mapping(vm_flags_t flags)
{
        return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK);
}

/*
 * Data area - private, writable, not stack
 */
static inline bool is_data_mapping(vm_flags_t flags)
{
        return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
}
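
/*
 * Example (illustrative, not part of the original header): a PROT_READ|
 * PROT_EXEC file mapping has VM_EXEC set but neither VM_WRITE nor VM_STACK,
 * so is_exec_mapping() is true and it is charged to exec_vm; a private
 * PROT_READ|PROT_WRITE anonymous mapping has VM_WRITE without VM_SHARED or
 * VM_STACK, so is_data_mapping() is true and it is charged to data_vm.
 */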

static inline void vma_iter_config(struct vma_iterator *vmi,
                unsigned long index, unsigned long last)
{
        __mas_set_range(&vmi->mas, index, last - 1);
}
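
/*
 * Note on the -1/+1 conversions in the iterator helpers: the underlying maple
 * tree stores inclusive [index, last] ranges, whereas a VMA covers the
 * half-open range [vm_start, vm_end). Hence stores use vm_end - 1 (and
 * last - 1 above), and vma_iter_end() below reports mas.last + 1.
 */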

static inline void vma_iter_reset(struct vma_iterator *vmi)
{
        mas_reset(&vmi->mas);
}

static inline
struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min)
{
        return mas_prev_range(&vmi->mas, min);
}

static inline
struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max)
{
        return mas_next_range(&vmi->mas, max);
}

static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min,
                                       unsigned long max, unsigned long size)
{
        return mas_empty_area(&vmi->mas, min, max - 1, size);
}

static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min,
                                        unsigned long max, unsigned long size)
{
        return mas_empty_area_rev(&vmi->mas, min, max - 1, size);
}

/*
 * VMA Iterator functions shared between nommu and mmap
 */
static inline int vma_iter_prealloc(struct vma_iterator *vmi,
                                    struct vm_area_struct *vma)
{
        return mas_preallocate(&vmi->mas, vma, GFP_KERNEL);
}

static inline void vma_iter_clear(struct vma_iterator *vmi)
{
        mas_store_prealloc(&vmi->mas, NULL);
}

static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
{
        return mas_walk(&vmi->mas);
}

/* Store a VMA with preallocated memory */
static inline void vma_iter_store(struct vma_iterator *vmi,
                                  struct vm_area_struct *vma)
{

#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
        if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
                        vmi->mas.index > vma->vm_start)) {
                pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n",
                        vmi->mas.index, vma->vm_start, vma->vm_start,
                        vma->vm_end, vmi->mas.index, vmi->mas.last);
        }
        if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
                        vmi->mas.last < vma->vm_start)) {
                pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n",
                        vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end,
                        vmi->mas.index, vmi->mas.last);
        }
#endif

        if (vmi->mas.status != ma_start &&
            ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
                vma_iter_invalidate(vmi);

        __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
        mas_store_prealloc(&vmi->mas, vma);
}
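
/*
 * Illustrative sketch (not part of the original header) of the preallocate /
 * store pairing used by the helpers above; error handling and locking are
 * elided and the names are assumptions of the example:
 *
 *      if (vma_iter_prealloc(vmi, vma))
 *              return -ENOMEM;         // allocate tree nodes up front ...
 *      vma_iter_store(vmi, vma);       // ... so the store itself cannot fail
 *
 * vma_iter_clear() likewise consumes preallocated nodes to wipe the current
 * range, while vma_iter_store_gfp() above allocates at store time instead.
 */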

static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
{
        return vmi->mas.index;
}

static inline unsigned long vma_iter_end(struct vma_iterator *vmi)
{
        return vmi->mas.last + 1;
}

static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
                                      unsigned long count)
{
        return mas_expected_entries(&vmi->mas, count);
}

static inline
struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi)
{
        return mas_prev_range(&vmi->mas, 0);
}

/*
 * Retrieve the next VMA and rewind the iterator to the end of the previous
 * VMA, or if no previous VMA, to index 0.
 */
static inline
struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi,
                struct vm_area_struct **pprev)
{
        struct vm_area_struct *next = vma_next(vmi);
        struct vm_area_struct *prev = vma_prev(vmi);

        /*
         * Consider the case where no previous VMA exists. We advance to the
         * next VMA, skipping any gap, then rewind to the start of the range.
         *
         * If we were to unconditionally advance to the next range we'd wind up
         * at the next VMA again, so we check to ensure there is a previous VMA
         * to skip over.
         */
        if (prev)
                vma_iter_next_range(vmi);

        if (pprev)
                *pprev = prev;

        return next;
}

#ifdef CONFIG_64BIT
static inline bool vma_is_sealed(struct vm_area_struct *vma)
{
        return (vma->vm_flags & VM_SEALED);
}

/*
 * Check if a VMA is sealed for modification.
 * Return true if modification is allowed.
 */
static inline bool can_modify_vma(struct vm_area_struct *vma)
{
        if (unlikely(vma_is_sealed(vma)))
                return false;

        return true;
}

bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior);

#else

static inline bool can_modify_vma(struct vm_area_struct *vma)
{
        return true;
}

static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
{
        return true;
}

#endif

#endif  /* __MM_VMA_H */