// SPDX-License-Identifier: GPL-2.0
/*
 *  mm/mprotect.c
 *
 *  (C) Copyright 1994 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 *
 *  Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/pagewalk.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/mempolicy.h>
#include <linux/personality.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/pkeys.h>
#include <linux/ksm.h>
#include <linux/uaccess.h>
#include <linux/mm_inline.h>
#include <linux/pgtable.h>
#include <linux/sched/sysctl.h>
#include <linux/userfaultfd_k.h>
#include <linux/memory-tiers.h>
#include <uapi/linux/mman.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>

#include "internal.h"
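
/*
 * Decide whether a PTE in a VM_WRITE mapping can be made writable right
 * away, i.e. without going through the write-fault handler first: exclusive
 * anonymous pages in private mappings, and already-dirty PTEs in shared
 * mappings that do not need a further writenotify fault.
 */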
bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
			     pte_t pte)
{
	struct page *page;

	if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
		return false;

	/* Don't touch entries that are not even readable. */
	if (pte_protnone(pte))
		return false;

	/* Do we need write faults for softdirty tracking? */
	if (pte_needs_soft_dirty_wp(vma, pte))
		return false;

	/* Do we need write faults for uffd-wp tracking? */
	if (userfaultfd_pte_wp(vma, pte))
		return false;

	if (!(vma->vm_flags & VM_SHARED)) {
		/*
		 * Writable MAP_PRIVATE mapping: We can only special-case on
		 * exclusive anonymous pages, because we know that our
		 * write-fault handler similarly would map them writable without
		 * any additional checks while holding the PT lock.
		 */
		page = vm_normal_page(vma, addr, pte);
		return page && PageAnon(page) && PageAnonExclusive(page);
	}

	VM_WARN_ON_ONCE(is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte));

	/*
	 * Writable MAP_SHARED mapping: "clean" might indicate that the FS still
	 * needs a real write-fault for writenotify
	 * (see vma_wants_writenotify()). If "dirty", the assumption is that the
	 * FS was already notified and we can simply mark the PTE writable
	 * just like the write-fault handler would do.
	 */
	return pte_dirty(pte);
}
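
/*
 * Change protection on the PTEs mapped by one PTE page table.  Returns the
 * number of PTEs that were updated, or -EAGAIN when the PTE table could not
 * be mapped and the caller should retry this PMD.
 */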
static long change_pte_range(struct mmu_gather *tlb,
		struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
	pte_t *pte, oldpte;
	spinlock_t *ptl;
	long pages = 0;
	int target_node = NUMA_NO_NODE;
	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;

	tlb_change_page_size(tlb, PAGE_SIZE);
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!pte)
		return -EAGAIN;

	/* Get target node for single threaded private VMAs */
	if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
	    atomic_read(&vma->vm_mm->mm_users) == 1)
		target_node = numa_node_id();

	flush_tlb_batched_pending(vma->vm_mm);
	arch_enter_lazy_mmu_mode();
	do {
		oldpte = ptep_get(pte);
		if (pte_present(oldpte)) {
			pte_t ptent;

			/*
			 * Avoid trapping faults against the zero or KSM
			 * pages. See similar comment in change_huge_pmd.
			 */
			if (prot_numa) {
				struct folio *folio;
				int nid;
				bool toptier;

				/* Avoid TLB flush if possible */
				if (pte_protnone(oldpte))
					continue;

				folio = vm_normal_folio(vma, addr, oldpte);
				if (!folio || folio_is_zone_device(folio) ||
				    folio_test_ksm(folio))
					continue;

				/* Also skip shared copy-on-write pages */
				if (is_cow_mapping(vma->vm_flags) &&
				    (folio_maybe_dma_pinned(folio) ||
				     folio_likely_mapped_shared(folio)))
					continue;

				/*
				 * While migration can move some dirty pages,
				 * it cannot move them all from MIGRATE_ASYNC
				 * context.
				 */
				if (folio_is_file_lru(folio) &&
				    folio_test_dirty(folio))
					continue;

				/*
				 * Don't mess with PTEs if page is already on the node
				 * a single-threaded process is running on.
				 */
				nid = folio_nid(folio);
				if (target_node == nid)
					continue;
				toptier = node_is_toptier(nid);

				/*
				 * Skip scanning top tier node if normal numa
				 * balancing is disabled
				 */
				if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
				    toptier)
					continue;
				if (folio_use_access_time(folio))
					folio_xchg_access_time(folio,
						jiffies_to_msecs(jiffies));
			}

			oldpte = ptep_modify_prot_start(vma, addr, pte);
			ptent = pte_modify(oldpte, newprot);

			if (uffd_wp)
				ptent = pte_mkuffd_wp(ptent);
			else if (uffd_wp_resolve)
				ptent = pte_clear_uffd_wp(ptent);

			/*
			 * In some writable, shared mappings, we might want
			 * to catch actual write access -- see
			 * vma_wants_writenotify().
			 *
			 * In all writable, private mappings, we have to
			 * properly handle COW.
			 *
			 * In both cases, we can sometimes still change PTEs
			 * writable and avoid the write-fault handler, for
			 * example, if a PTE is already dirty and no other
			 * COW or special handling is required.
			 */
			if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
			    !pte_write(ptent) &&
			    can_change_pte_writable(vma, addr, ptent))
				ptent = pte_mkwrite(ptent, vma);

			ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
			if (pte_needs_flush(oldpte, ptent))
				tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
			pages++;
		} else if (is_swap_pte(oldpte)) {
			swp_entry_t entry = pte_to_swp_entry(oldpte);
			pte_t newpte;

			if (is_writable_migration_entry(entry)) {
				struct folio *folio = pfn_swap_entry_folio(entry);

				/*
				 * A protection check is difficult so
				 * just be safe and disable write
				 */
				if (folio_test_anon(folio))
					entry = make_readable_exclusive_migration_entry(
							     swp_offset(entry));
				else
					entry = make_readable_migration_entry(swp_offset(entry));
				newpte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(oldpte))
					newpte = pte_swp_mksoft_dirty(newpte);
			} else if (is_writable_device_private_entry(entry)) {
				/*
				 * We do not preserve soft-dirtiness. See
				 * copy_nonpresent_pte() for explanation.
				 */
				entry = make_readable_device_private_entry(
							swp_offset(entry));
				newpte = swp_entry_to_pte(entry);
				if (pte_swp_uffd_wp(oldpte))
					newpte = pte_swp_mkuffd_wp(newpte);
			} else if (is_writable_device_exclusive_entry(entry)) {
				entry = make_readable_device_exclusive_entry(
							swp_offset(entry));
				newpte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(oldpte))
					newpte = pte_swp_mksoft_dirty(newpte);
				if (pte_swp_uffd_wp(oldpte))
					newpte = pte_swp_mkuffd_wp(newpte);
			} else if (is_pte_marker_entry(entry)) {
				/*
				 * Ignore error swap entries unconditionally,
				 * because any access should sigbus anyway.
				 */
				if (is_poisoned_swp_entry(entry))
					continue;
				/*
				 * If this is uffd-wp pte marker and we'd like
				 * to unprotect it, drop it; the next page
				 * fault will trigger without uffd trapping.
				 */
				if (uffd_wp_resolve) {
					pte_clear(vma->vm_mm, addr, pte);
					pages++;
				}
				continue;
			} else {
				newpte = oldpte;
			}

			if (uffd_wp)
				newpte = pte_swp_mkuffd_wp(newpte);
			else if (uffd_wp_resolve)
				newpte = pte_swp_clear_uffd_wp(newpte);

			if (!pte_same(oldpte, newpte)) {
				set_pte_at(vma->vm_mm, addr, pte, newpte);
				pages++;
			}
		} else {
			/* It must be a none pte, or what else?.. */
			WARN_ON_ONCE(!pte_none(oldpte));

			/*
			 * Nobody plays with any none ptes besides
			 * userfaultfd when applying the protections.
			 */
			if (likely(!uffd_wp))
				continue;

			if (userfaultfd_wp_use_markers(vma)) {
				/*
				 * For file-backed mem, we need to be able to
				 * wr-protect a none pte, because even if the
				 * pte is none, the page/swap cache could
				 * exist.  Do that by installing a marker.
				 */
				set_pte_at(vma->vm_mm, addr, pte,
					   make_pte_marker(PTE_MARKER_UFFD_WP));
				pages++;
			}
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);

	return pages;
}

/*
 * Return true if we want to split THPs into PTE mappings in change
 * protection procedure, false otherwise.
 */
static inline bool
pgtable_split_needed(struct vm_area_struct *vma, unsigned long cp_flags)
{
	/*
	 * pte markers only reside at the pte level; if we need pte markers,
	 * we need to split.  For example, we cannot wr-protect a file thp
	 * (e.g. 2M shmem) because file thp is handled differently when
	 * split by erasing the pmd so far.
	 */
	return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma);
}

/*
 * Return true if we want to populate pgtables in change protection
 * procedure, false otherwise
 */
static inline bool
pgtable_populate_needed(struct vm_area_struct *vma, unsigned long cp_flags)
{
	/* If not within ioctl(UFFDIO_WRITEPROTECT), then don't bother */
	if (!(cp_flags & MM_CP_UFFD_WP))
		return false;

	/* Populate if the userfaultfd mode requires pte markers */
	return userfaultfd_wp_use_markers(vma);
}

/*
 * Populate the pgtable underneath for whatever reason if requested.
 * When {pte|pmd|...}_alloc() failed we treat it the same way as pgtable
 * allocation failures during page faults by kicking OOM and returning
 * error.
 */
#define change_pmd_prepare(vma, pmd, cp_flags)				\
	({								\
		long err = 0;						\
		if (unlikely(pgtable_populate_needed(vma, cp_flags))) {	\
			if (pte_alloc(vma->vm_mm, pmd))			\
				err = -ENOMEM;				\
		}							\
		err;							\
	})

/*
 * This is the general pud/p4d/pgd version of change_pmd_prepare(). We need to
 * have separate change_pmd_prepare() because pte_alloc() returns 0 on success,
 * while {pmd|pud|p4d}_alloc() returns the valid pointer on success.
 */
#define change_prepare(vma, high, low, addr, cp_flags)			\
	({								\
		long err = 0;						\
		if (unlikely(pgtable_populate_needed(vma, cp_flags))) {	\
			low##_t *p = low##_alloc(vma->vm_mm, high, addr); \
			if (p == NULL)					\
				err = -ENOMEM;				\
		}							\
		err;							\
	})
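
/*
 * Walk the PMD entries under one PUD entry: huge PMDs are either changed in
 * place or split, everything else drops down to change_pte_range().  Returns
 * the number of pages updated, or a negative error if pagetable population
 * failed.
 */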
static inline long change_pmd_range(struct mmu_gather *tlb,
		struct vm_area_struct *vma, pud_t *pud, unsigned long addr,
		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
	pmd_t *pmd;
	unsigned long next;
	long pages = 0;
	unsigned long nr_huge_updates = 0;

	pmd = pmd_offset(pud, addr);
	do {
		long ret;
		pmd_t _pmd;
again:
		next = pmd_addr_end(addr, end);

		ret = change_pmd_prepare(vma, pmd, cp_flags);
		if (ret) {
			pages = ret;
			break;
		}

		if (pmd_none(*pmd))
			goto next;

		_pmd = pmdp_get_lockless(pmd);
		if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) {
			if ((next - addr != HPAGE_PMD_SIZE) ||
			    pgtable_split_needed(vma, cp_flags)) {
				__split_huge_pmd(vma, pmd, addr, false, NULL);
				/*
				 * For file-backed, the pmd could have been
				 * cleared; make sure pmd populated if
				 * necessary, then fall-through to pte level.
				 */
				ret = change_pmd_prepare(vma, pmd, cp_flags);
				if (ret) {
					pages = ret;
					break;
				}
			} else {
				ret = change_huge_pmd(tlb, vma, pmd,
						      addr, newprot, cp_flags);
				if (ret) {
					if (ret == HPAGE_PMD_NR) {
						pages += HPAGE_PMD_NR;
						nr_huge_updates++;
					}

					/* huge pmd was handled */
					goto next;
				}
			}
			/* fall through, the trans huge pmd just split */
		}

		ret = change_pte_range(tlb, vma, pmd, addr, next, newprot,
				       cp_flags);
		if (ret < 0)
			goto again;
		pages += ret;
next:
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	if (nr_huge_updates)
		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
	return pages;
}
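
/*
 * Walk the PUD entries under one P4D entry.  The mmu_notifier range is only
 * started once the first populated PUD is found, so empty ranges never pay
 * for an invalidation; it is ended after the loop if it was started.
 */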
static inline long change_pud_range(struct mmu_gather *tlb,
		struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr,
		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
	struct mmu_notifier_range range;
	pud_t *pudp, pud;
	unsigned long next;
	long pages = 0, ret;

	range.start = 0;

	pudp = pud_offset(p4d, addr);
	do {
again:
		next = pud_addr_end(addr, end);
		ret = change_prepare(vma, pudp, pmd, addr, cp_flags);
		if (ret) {
			pages = ret;
			break;
		}

		pud = READ_ONCE(*pudp);
		if (pud_none(pud))
			continue;

		if (!range.start) {
			mmu_notifier_range_init(&range,
						MMU_NOTIFY_PROTECTION_VMA, 0,
						vma->vm_mm, addr, end);
			mmu_notifier_invalidate_range_start(&range);
		}

		if (pud_leaf(pud)) {
			if ((next - addr != PUD_SIZE) ||
			    pgtable_split_needed(vma, cp_flags)) {
				__split_huge_pud(vma, pudp, addr);
				goto again;
			} else {
				ret = change_huge_pud(tlb, vma, pudp,
						      addr, newprot, cp_flags);
				if (ret == 0)
					goto again;
				/* huge pud was handled */
				if (ret == HPAGE_PUD_NR)
					pages += HPAGE_PUD_NR;
				continue;
			}
		}

		pages += change_pmd_range(tlb, vma, pudp, addr, next, newprot,
					  cp_flags);
	} while (pudp++, addr = next, addr != end);

	if (range.start)
		mmu_notifier_invalidate_range_end(&range);

	return pages;
}

static inline long change_p4d_range(struct mmu_gather *tlb,
		struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr,
		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
	p4d_t *p4d;
	unsigned long next;
	long pages = 0, ret;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		ret = change_prepare(vma, p4d, pud, addr, cp_flags);
		if (ret)
			return ret;
		if (p4d_none_or_clear_bad(p4d))
			continue;
		pages += change_pud_range(tlb, vma, p4d, addr, next, newprot,
					  cp_flags);
	} while (p4d++, addr = next, addr != end);

	return pages;
}
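
/*
 * Top of the page-table walk: iterate the PGD entries covering [addr, end)
 * within @vma and descend level by level, accumulating the number of pages
 * whose protection was changed.
 */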
static long change_protection_range(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long addr,
		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	unsigned long next;
	long pages = 0, ret;

	BUG_ON(addr >= end);
	pgd = pgd_offset(mm, addr);
	tlb_start_vma(tlb, vma);
	do {
		next = pgd_addr_end(addr, end);
		ret = change_prepare(vma, pgd, p4d, addr, cp_flags);
		if (ret) {
			pages = ret;
			break;
		}
		if (pgd_none_or_clear_bad(pgd))
			continue;
		pages += change_p4d_range(tlb, vma, pgd, addr, next, newprot,
					  cp_flags);
	} while (pgd++, addr = next, addr != end);

	tlb_end_vma(tlb, vma);

	return pages;
}
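
/*
 * Common entry point for callers that change protection over a VMA range
 * (mprotect, userfaultfd write-protect, NUMA hinting).  Dispatches to the
 * hugetlb variant when needed and returns the number of pages updated.
 */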
long change_protection(struct mmu_gather *tlb,
		       struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, unsigned long cp_flags)
{
	pgprot_t newprot = vma->vm_page_prot;
	long pages;

	BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);

#ifdef CONFIG_NUMA_BALANCING
	/*
	 * Ordinary protection updates (mprotect, uffd-wp, softdirty tracking)
	 * are expected to reflect their requirements via VMA flags such that
	 * vma_set_page_prot() will adjust vma->vm_page_prot accordingly.
	 */
	if (cp_flags & MM_CP_PROT_NUMA)
		newprot = PAGE_NONE;
#else
	WARN_ON_ONCE(cp_flags & MM_CP_PROT_NUMA);
#endif

	if (is_vm_hugetlb_page(vma))
		pages = hugetlb_change_protection(vma, start, end, newprot,
						  cp_flags);
	else
		pages = change_protection_range(tlb, vma, start, end, newprot,
						cp_flags);

	return pages;
}
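
/*
 * Page-walk callbacks used by mprotect_fixup() when the architecture
 * implements arch_has_pfn_modify_check(): they let the architecture veto
 * switching individual PFNs of a VM_PFNMAP/VM_MIXEDMAP mapping to PROT_NONE
 * (see pfn_modify_allowed()).
 */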
static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
			       unsigned long next, struct mm_walk *walk)
{
	return pfn_modify_allowed(pte_pfn(ptep_get(pte)),
				  *(pgprot_t *)(walk->private)) ?
		0 : -EACCES;
}

static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
				   unsigned long addr, unsigned long next,
				   struct mm_walk *walk)
{
	return pfn_modify_allowed(pte_pfn(ptep_get(pte)),
				  *(pgprot_t *)(walk->private)) ?
		0 : -EACCES;
}

static int prot_none_test(unsigned long addr, unsigned long next,
			  struct mm_walk *walk)
{
	return 0;
}

static const struct mm_walk_ops prot_none_walk_ops = {
	.pte_entry		= prot_none_pte_entry,
	.hugetlb_entry		= prot_none_hugetlb_entry,
	.test_walk		= prot_none_test,
	.walk_lock		= PGWALK_WRLOCK,
};
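
/*
 * Apply @newflags to the range [start, end) of @vma: split or merge VMAs as
 * needed, update accounting and vma->vm_page_prot, and rewrite the page
 * tables via change_protection().  *pprev is set to the VMA now covering the
 * start of the range.
 */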
int
mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
	       struct vm_area_struct *vma, struct vm_area_struct **pprev,
	       unsigned long start, unsigned long end, unsigned long newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long oldflags = vma->vm_flags;
	long nrpages = (end - start) >> PAGE_SHIFT;
	unsigned int mm_cp_flags = 0;
	unsigned long charged = 0;
	int error;

	if (!can_modify_vma(vma))
		return -EPERM;

	if (newflags == oldflags) {
		*pprev = vma;
		return 0;
	}

	/*
	 * Do PROT_NONE PFN permission checks here when we can still
	 * bail out without undoing a lot of state. This is a rather
	 * uncommon case, so doesn't need to be very optimized.
	 */
	if (arch_has_pfn_modify_check() &&
	    (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
	    (newflags & VM_ACCESS_FLAGS) == 0) {
		pgprot_t new_pgprot = vm_get_page_prot(newflags);

		error = walk_page_range(current->mm, start, end,
					&prot_none_walk_ops, &new_pgprot);
		if (error)
			return error;
	}

	/*
	 * If we make a private mapping writable we increase our commit;
	 * but (without finer accounting) cannot reduce our commit if we
	 * make it unwritable again, except in the anonymous case where no
	 * anon_vma has yet been assigned.
	 *
	 * hugetlb mappings were accounted for even if read-only, so there
	 * is no need to account for them here.
	 */
	if (newflags & VM_WRITE) {
		/* Check space limits when area turns into data. */
		if (!may_expand_vm(mm, newflags, nrpages) &&
		    may_expand_vm(mm, oldflags, nrpages))
			return -ENOMEM;
		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
						VM_SHARED|VM_NORESERVE))) {
			charged = nrpages;
			if (security_vm_enough_memory_mm(mm, charged))
				return -ENOMEM;
			newflags |= VM_ACCOUNT;
		}
	} else if ((oldflags & VM_ACCOUNT) && vma_is_anonymous(vma) &&
		   !vma->anon_vma) {
		newflags &= ~VM_ACCOUNT;
	}

	vma = vma_modify_flags(vmi, *pprev, vma, start, end, newflags);
	if (IS_ERR(vma)) {
		error = PTR_ERR(vma);
		goto fail;
	}

	*pprev = vma;

	/*
	 * vm_flags and vm_page_prot are protected by the mmap_lock
	 * held in write mode.
	 */
	vma_start_write(vma);
	vm_flags_reset(vma, newflags);
	if (vma_wants_manual_pte_write_upgrade(vma))
		mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
	vma_set_page_prot(vma);

	change_protection(tlb, vma, start, end, mm_cp_flags);

	if ((oldflags & VM_ACCOUNT) && !(newflags & VM_ACCOUNT))
		vm_unacct_memory(nrpages);

	/*
	 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
	 * fault on access.
	 */
	if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
			(newflags & VM_WRITE)) {
		populate_vma_page_range(vma, start, end, NULL);
	}

	vm_stat_account(mm, oldflags, -nrpages);
	vm_stat_account(mm, newflags, nrpages);
	perf_event_mmap(vma);
	return 0;

fail:
	vm_unacct_memory(charged);
	return error;
}

/*
 * pkey==-1 when doing a legacy mprotect()
 */
static int do_mprotect_pkey(unsigned long start, size_t len,
			    unsigned long prot, int pkey)
{
	unsigned long nstart, end, tmp, reqprot;
	struct vm_area_struct *vma, *prev;
	int error;
	const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
	const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
				(prot & PROT_READ);
	struct mmu_gather tlb;
	struct vma_iterator vmi;

	start = untagged_addr(start);

	prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
	if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
		return -EINVAL;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (!len)
		return 0;
	len = PAGE_ALIGN(len);
	end = start + len;
	if (end <= start)
		return -ENOMEM;
	if (!arch_validate_prot(prot, start))
		return -EINVAL;

	reqprot = prot;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	/*
	 * If userspace did not allocate the pkey, do not let
	 * them use it here.
	 */
	error = -EINVAL;
	if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
		goto out;

	vma_iter_init(&vmi, current->mm, start);
	vma = vma_find(&vmi, end);
	error = -ENOMEM;
	if (!vma)
		goto out;

	if (unlikely(grows & PROT_GROWSDOWN)) {
		if (vma->vm_start >= end)
			goto out;
		start = vma->vm_start;
		error = -EINVAL;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto out;
	} else {
		if (vma->vm_start > start)
			goto out;
		if (unlikely(grows & PROT_GROWSUP)) {
			end = vma->vm_end;
			error = -EINVAL;
			if (!(vma->vm_flags & VM_GROWSUP))
				goto out;
		}
	}

	prev = vma_prev(&vmi);
	if (start > vma->vm_start)
		prev = vma;

	tlb_gather_mmu(&tlb, current->mm);
	nstart = start;
	tmp = vma->vm_start;
	for_each_vma_range(vmi, vma, end) {
		unsigned long mask_off_old_flags;
		unsigned long newflags;
		int new_vma_pkey;

		if (vma->vm_start != tmp) {
			error = -ENOMEM;
			break;
		}

		/* Does the application expect PROT_READ to imply PROT_EXEC */
		if (rier && (vma->vm_flags & VM_MAYEXEC))
			prot |= PROT_EXEC;

		/*
		 * Each mprotect() call explicitly passes r/w/x permissions.
		 * If a permission is not passed to mprotect(), it must be
		 * cleared from the VMA.
		 */
		mask_off_old_flags = VM_ACCESS_FLAGS | VM_FLAGS_CLEAR;

		new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
		newflags = calc_vm_prot_bits(prot, new_vma_pkey);
		newflags |= (vma->vm_flags & ~mask_off_old_flags);

		/* newflags >> 4 shift VM_MAY% in place of VM_% */
		if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) {
			error = -EACCES;
			break;
		}

		if (map_deny_write_exec(vma->vm_flags, newflags)) {
			error = -EACCES;
			break;
		}

		/* Allow architectures to sanity-check the new flags */
		if (!arch_validate_flags(newflags)) {
			error = -EINVAL;
			break;
		}

		error = security_file_mprotect(vma, reqprot, prot);
		if (error)
			break;

		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;

		if (vma->vm_ops && vma->vm_ops->mprotect) {
			error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags);
			if (error)
				break;
		}

		error = mprotect_fixup(&vmi, &tlb, vma, &prev, nstart, tmp, newflags);
		if (error)
			break;

		tmp = vma_iter_end(&vmi);
		nstart = tmp;
		prot = reqprot;
	}
	tlb_finish_mmu(&tlb);

	if (!error && tmp < end)
		error = -ENOMEM;

out:
	mmap_write_unlock(current->mm);
	return error;
}

SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
		unsigned long, prot)
{
	return do_mprotect_pkey(start, len, prot, -1);
}

#ifdef CONFIG_ARCH_HAS_PKEYS

SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
		unsigned long, prot, int, pkey)
{
	return do_mprotect_pkey(start, len, prot, pkey);
}

SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
{
	int pkey;
	int ret;

	/* No flags supported yet. */
	if (flags)
		return -EINVAL;
	/* check for unsupported init values */
	if (init_val & ~PKEY_ACCESS_MASK)
		return -EINVAL;

	mmap_write_lock(current->mm);
	pkey = mm_pkey_alloc(current->mm);

	ret = -ENOSPC;
	if (pkey == -1)
		goto out;

	ret = arch_set_user_pkey_access(current, pkey, init_val);
	if (ret) {
		mm_pkey_free(current->mm, pkey);
		goto out;
	}

	ret = pkey;
out:
	mmap_write_unlock(current->mm);
	return ret;
}

SYSCALL_DEFINE1(pkey_free, int, pkey)
{
	int ret;

	mmap_write_lock(current->mm);
	ret = mm_pkey_free(current->mm, pkey);
	mmap_write_unlock(current->mm);

	/*
	 * We could provide warnings or errors if any VMA still
	 * has the pkey set here.
	 */
	return ret;
}

#endif /* CONFIG_ARCH_HAS_PKEYS */