  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Implement mseal() syscall.
  4. *
  5. * Copyright (c) 2023,2024 Google, Inc.
  6. *
  7. * Author: Jeff Xu <jeffxu@chromium.org>
  8. */
  9. #include <linux/mempolicy.h>
  10. #include <linux/mman.h>
  11. #include <linux/mm.h>
  12. #include <linux/mm_inline.h>
  13. #include <linux/mmu_context.h>
  14. #include <linux/syscalls.h>
  15. #include <linux/sched.h>
  16. #include "internal.h"
/* Mark @vma as sealed by setting VM_SEALED in its vm_flags. */
static inline void set_vma_sealed(struct vm_area_struct *vma)
{
	vm_flags_set(vma, VM_SEALED);
}
  21. static bool is_madv_discard(int behavior)
  22. {
  23. switch (behavior) {
  24. case MADV_FREE:
  25. case MADV_DONTNEED:
  26. case MADV_DONTNEED_LOCKED:
  27. case MADV_REMOVE:
  28. case MADV_DONTFORK:
  29. case MADV_WIPEONFORK:
  30. return true;
  31. }
  32. return false;
  33. }
  34. static bool is_ro_anon(struct vm_area_struct *vma)
  35. {
  36. /* check anonymous mapping. */
  37. if (vma->vm_file || vma->vm_flags & VM_SHARED)
  38. return false;
  39. /*
  40. * check for non-writable:
  41. * PROT=RO or PKRU is not writeable.
  42. */
  43. if (!(vma->vm_flags & VM_WRITE) ||
  44. !arch_vma_access_permitted(vma, true, false, false))
  45. return true;
  46. return false;
  47. }
  48. /*
  49. * Check if a vma is allowed to be modified by madvise.
  50. */
  51. bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
  52. {
  53. if (!is_madv_discard(behavior))
  54. return true;
  55. if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
  56. return false;
  57. /* Allow by default. */
  58. return true;
  59. }
  60. static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
  61. struct vm_area_struct **prev, unsigned long start,
  62. unsigned long end, vm_flags_t newflags)
  63. {
  64. int ret = 0;
  65. vm_flags_t oldflags = vma->vm_flags;
  66. if (newflags == oldflags)
  67. goto out;
  68. vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
  69. if (IS_ERR(vma)) {
  70. ret = PTR_ERR(vma);
  71. goto out;
  72. }
  73. set_vma_sealed(vma);
  74. out:
  75. *prev = vma;
  76. return ret;
  77. }
  78. /*
  79. * Check for do_mseal:
  80. * 1> start is part of a valid vma.
  81. * 2> end is part of a valid vma.
  82. * 3> No gap (unallocated address) between start and end.
  83. * 4> map is sealable.
  84. */
  85. static int check_mm_seal(unsigned long start, unsigned long end)
  86. {
  87. struct vm_area_struct *vma;
  88. unsigned long nstart = start;
  89. VMA_ITERATOR(vmi, current->mm, start);
  90. /* going through each vma to check. */
  91. for_each_vma_range(vmi, vma, end) {
  92. if (vma->vm_start > nstart)
  93. /* unallocated memory found. */
  94. return -ENOMEM;
  95. if (vma->vm_end >= end)
  96. return 0;
  97. nstart = vma->vm_end;
  98. }
  99. return -ENOMEM;
  100. }
/*
 * Apply sealing: walk every vma in [start, end) and OR VM_SEALED into
 * its flags, splitting vmas at the range boundaries as needed.
 * Must be called with mmap write lock held, after check_mm_seal()
 * has validated the range.
 */
static int apply_mm_seal(unsigned long start, unsigned long end)
{
	unsigned long nstart;
	struct vm_area_struct *vma, *prev;

	VMA_ITERATOR(vmi, current->mm, start);

	vma = vma_iter_load(&vmi);
	/*
	 * Note: check_mm_seal should have already checked the ENOMEM case,
	 * so vma should not be null, same for the other ENOMEM cases.
	 */
	prev = vma_prev(&vmi);
	/* If start lands mid-vma, that vma itself is the "prev" for fixup. */
	if (start > vma->vm_start)
		prev = vma;

	nstart = start;
	for_each_vma_range(vmi, vma, end) {
		int error;
		unsigned long tmp;
		vm_flags_t newflags;

		newflags = vma->vm_flags | VM_SEALED;
		/* Clamp to the requested range end for the last vma. */
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;

		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
		if (error)
			return error;
		/* Continue from wherever the (possibly split) vma ended. */
		nstart = vma_iter_end(&vmi);
	}

	return 0;
}
  133. /*
  134. * mseal(2) seals the VM's meta data from
  135. * selected syscalls.
  136. *
  137. * addr/len: VM address range.
  138. *
  139. * The address range by addr/len must meet:
  140. * start (addr) must be in a valid VMA.
  141. * end (addr + len) must be in a valid VMA.
  142. * no gap (unallocated memory) between start and end.
  143. * start (addr) must be page aligned.
  144. *
  145. * len: len will be page aligned implicitly.
  146. *
  147. * Below VMA operations are blocked after sealing.
  148. * 1> Unmapping, moving to another location, and shrinking
  149. * the size, via munmap() and mremap(), can leave an empty
  150. * space, therefore can be replaced with a VMA with a new
  151. * set of attributes.
  152. * 2> Moving or expanding a different vma into the current location,
  153. * via mremap().
  154. * 3> Modifying a VMA via mmap(MAP_FIXED).
  155. * 4> Size expansion, via mremap(), does not appear to pose any
  156. * specific risks to sealed VMAs. It is included anyway because
  157. * the use case is unclear. In any case, users can rely on
  158. * merging to expand a sealed VMA.
  159. * 5> mprotect and pkey_mprotect.
  160. * 6> Some destructive madvice() behavior (e.g. MADV_DONTNEED)
  161. * for anonymous memory, when users don't have write permission to the
  162. * memory. Those behaviors can alter region contents by discarding pages,
  163. * effectively a memset(0) for anonymous memory.
  164. *
  165. * flags: reserved.
  166. *
  167. * return values:
  168. * zero: success.
  169. * -EINVAL:
  170. * invalid input flags.
  171. * start address is not page aligned.
  172. * Address arange (start + len) overflow.
  173. * -ENOMEM:
  174. * addr is not a valid address (not allocated).
  175. * end (start + len) is not a valid address.
  176. * a gap (unallocated memory) between start and end.
  177. * -EPERM:
  178. * - In 32 bit architecture, sealing is not supported.
  179. * Note:
  180. * user can call mseal(2) multiple times, adding a seal on an
  181. * already sealed memory is a no-action (no error).
  182. *
  183. * unseal() is not supported.
  184. */
  185. int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
  186. {
  187. size_t len;
  188. int ret = 0;
  189. unsigned long end;
  190. struct mm_struct *mm = current->mm;
  191. ret = can_do_mseal(flags);
  192. if (ret)
  193. return ret;
  194. start = untagged_addr(start);
  195. if (!PAGE_ALIGNED(start))
  196. return -EINVAL;
  197. len = PAGE_ALIGN(len_in);
  198. /* Check to see whether len was rounded up from small -ve to zero. */
  199. if (len_in && !len)
  200. return -EINVAL;
  201. end = start + len;
  202. if (end < start)
  203. return -EINVAL;
  204. if (end == start)
  205. return 0;
  206. if (mmap_write_lock_killable(mm))
  207. return -EINTR;
  208. /*
  209. * First pass, this helps to avoid
  210. * partial sealing in case of error in input address range,
  211. * e.g. ENOMEM error.
  212. */
  213. ret = check_mm_seal(start, end);
  214. if (ret)
  215. goto out;
  216. /*
  217. * Second pass, this should success, unless there are errors
  218. * from vma_modify_flags, e.g. merge/split error, or process
  219. * reaching the max supported VMAs, however, those cases shall
  220. * be rare.
  221. */
  222. ret = apply_mm_seal(start, end);
  223. out:
  224. mmap_write_unlock(current->mm);
  225. return ret;
  226. }
/* mseal(2) syscall entry point; all semantics live in do_mseal(). */
SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
		flags)
{
	return do_mseal(start, len, flags);
}