// SPDX-License-Identifier: GPL-2.0
/*
 * linux/mm/mlock.c
 *
 * (C) Copyright 1995 Linus Torvalds
 * (C) Copyright 2002 Christoph Hellwig
 */

#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/sched/user.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/pagewalk.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/secretmem.h>

#include "internal.h"

struct mlock_fbatch {
	local_lock_t lock;
	struct folio_batch fbatch;
};

static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

bool can_do_mlock(void)
{
	if (rlimit(RLIMIT_MEMLOCK) != 0)
		return true;
	if (capable(CAP_IPC_LOCK))
		return true;
	return false;
}
EXPORT_SYMBOL(can_do_mlock);
/*
 * Mlocked folios are marked with the PG_mlocked flag for efficient testing
 * in vmscan and, possibly, the fault path; and to support semi-accurate
 * statistics.
 *
 * An mlocked folio [folio_test_mlocked(folio)] is unevictable. As such, it
 * will be ostensibly placed on the LRU "unevictable" list (actually no such
 * list exists), rather than the [in]active lists. PG_unevictable is set to
 * indicate the unevictable state.
 */

static struct lruvec *__mlock_folio(struct folio *folio, struct lruvec *lruvec)
{
	/* There is nothing more we can do while it's off LRU */
	if (!folio_test_clear_lru(folio))
		return lruvec;

	lruvec = folio_lruvec_relock_irq(folio, lruvec);

	if (unlikely(folio_evictable(folio))) {
		/*
		 * This is a little surprising, but quite possible: PG_mlocked
		 * must have got cleared already by another CPU. Could this
		 * folio be unevictable? I'm not sure, but move it now if so.
		 */
		if (folio_test_unevictable(folio)) {
			lruvec_del_folio(lruvec, folio);
			folio_clear_unevictable(folio);
			lruvec_add_folio(lruvec, folio);

			__count_vm_events(UNEVICTABLE_PGRESCUED,
					  folio_nr_pages(folio));
		}
		goto out;
	}

	if (folio_test_unevictable(folio)) {
		if (folio_test_mlocked(folio))
			folio->mlock_count++;
		goto out;
	}

	lruvec_del_folio(lruvec, folio);
	folio_clear_active(folio);
	folio_set_unevictable(folio);
	folio->mlock_count = !!folio_test_mlocked(folio);
	lruvec_add_folio(lruvec, folio);
	__count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio));
out:
	folio_set_lru(folio);
	return lruvec;
}

static struct lruvec *__mlock_new_folio(struct folio *folio, struct lruvec *lruvec)
{
	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

	lruvec = folio_lruvec_relock_irq(folio, lruvec);

	/* As above, this is a little surprising, but possible */
	if (unlikely(folio_evictable(folio)))
		goto out;

	folio_set_unevictable(folio);
	folio->mlock_count = !!folio_test_mlocked(folio);
	__count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio));
out:
	lruvec_add_folio(lruvec, folio);
	folio_set_lru(folio);
	return lruvec;
}

static struct lruvec *__munlock_folio(struct folio *folio, struct lruvec *lruvec)
{
	int nr_pages = folio_nr_pages(folio);
	bool isolated = false;

	if (!folio_test_clear_lru(folio))
		goto munlock;

	isolated = true;
	lruvec = folio_lruvec_relock_irq(folio, lruvec);

	if (folio_test_unevictable(folio)) {
		/* Then mlock_count is maintained, but might undercount */
		if (folio->mlock_count)
			folio->mlock_count--;
		if (folio->mlock_count)
			goto out;
	}
	/* else assume that was the last mlock: reclaim will fix it if not */

munlock:
	if (folio_test_clear_mlocked(folio)) {
		__zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
		if (isolated || !folio_test_unevictable(folio))
			__count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
		else
			__count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
	}

	/* folio_evictable() has to be checked *after* clearing Mlocked */
	if (isolated && folio_test_unevictable(folio) && folio_evictable(folio)) {
		lruvec_del_folio(lruvec, folio);
		folio_clear_unevictable(folio);
		lruvec_add_folio(lruvec, folio);
		__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
	}
out:
	if (isolated)
		folio_set_lru(folio);
	return lruvec;
}
/*
 * Flags held in the low bits of a struct folio pointer on the mlock_fbatch.
 */
#define LRU_FOLIO 0x1
#define NEW_FOLIO 0x2

static inline struct folio *mlock_lru(struct folio *folio)
{
	return (struct folio *)((unsigned long)folio + LRU_FOLIO);
}

static inline struct folio *mlock_new(struct folio *folio)
{
	return (struct folio *)((unsigned long)folio + NEW_FOLIO);
}

/*
 * mlock_folio_batch() is derived from folio_batch_move_lru(): perhaps that can
 * make use of such folio pointer flags in future, but for now just keep it for
 * mlock. We could use three separate folio batches instead, but one feels
 * better (munlocking a full folio batch does not need to drain mlocking folio
 * batches first).
 */
static void mlock_folio_batch(struct folio_batch *fbatch)
{
	struct lruvec *lruvec = NULL;
	unsigned long mlock;
	struct folio *folio;
	int i;

	for (i = 0; i < folio_batch_count(fbatch); i++) {
		folio = fbatch->folios[i];
		mlock = (unsigned long)folio & (LRU_FOLIO | NEW_FOLIO);
		folio = (struct folio *)((unsigned long)folio - mlock);
		fbatch->folios[i] = folio;

		if (mlock & LRU_FOLIO)
			lruvec = __mlock_folio(folio, lruvec);
		else if (mlock & NEW_FOLIO)
			lruvec = __mlock_new_folio(folio, lruvec);
		else
			lruvec = __munlock_folio(folio, lruvec);
	}

	if (lruvec)
		unlock_page_lruvec_irq(lruvec);
	folios_put(fbatch);
}

void mlock_drain_local(void)
{
	struct folio_batch *fbatch;

	local_lock(&mlock_fbatch.lock);
	fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
	if (folio_batch_count(fbatch))
		mlock_folio_batch(fbatch);
	local_unlock(&mlock_fbatch.lock);
}

void mlock_drain_remote(int cpu)
{
	struct folio_batch *fbatch;

	WARN_ON_ONCE(cpu_online(cpu));
	fbatch = &per_cpu(mlock_fbatch.fbatch, cpu);
	if (folio_batch_count(fbatch))
		mlock_folio_batch(fbatch);
}

bool need_mlock_drain(int cpu)
{
	return folio_batch_count(&per_cpu(mlock_fbatch.fbatch, cpu));
}

/**
 * mlock_folio - mlock a folio already on (or temporarily off) LRU
 * @folio: folio to be mlocked.
 */
void mlock_folio(struct folio *folio)
{
	struct folio_batch *fbatch;

	local_lock(&mlock_fbatch.lock);
	fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);

	if (!folio_test_set_mlocked(folio)) {
		int nr_pages = folio_nr_pages(folio);

		zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
		__count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
	}

	folio_get(folio);
	if (!folio_batch_add(fbatch, mlock_lru(folio)) ||
	    folio_test_large(folio) || lru_cache_disabled())
		mlock_folio_batch(fbatch);
	local_unlock(&mlock_fbatch.lock);
}

/**
 * mlock_new_folio - mlock a newly allocated folio not yet on LRU
 * @folio: folio to be mlocked, either normal or a THP head.
 */
void mlock_new_folio(struct folio *folio)
{
	struct folio_batch *fbatch;
	int nr_pages = folio_nr_pages(folio);

	local_lock(&mlock_fbatch.lock);
	fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);

	folio_set_mlocked(folio);
	zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
	__count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);

	folio_get(folio);
	if (!folio_batch_add(fbatch, mlock_new(folio)) ||
	    folio_test_large(folio) || lru_cache_disabled())
		mlock_folio_batch(fbatch);
	local_unlock(&mlock_fbatch.lock);
}

/**
 * munlock_folio - munlock a folio
 * @folio: folio to be munlocked, either normal or a THP head.
 */
void munlock_folio(struct folio *folio)
{
	struct folio_batch *fbatch;

	local_lock(&mlock_fbatch.lock);
	fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
	/*
	 * folio_test_clear_mlocked(folio) must be left to __munlock_folio(),
	 * which will check whether the folio is multiply mlocked.
	 */
	folio_get(folio);
	if (!folio_batch_add(fbatch, folio) ||
	    folio_test_large(folio) || lru_cache_disabled())
		mlock_folio_batch(fbatch);
	local_unlock(&mlock_fbatch.lock);
}
static inline unsigned int folio_mlock_step(struct folio *folio,
		pte_t *pte, unsigned long addr, unsigned long end)
{
	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
	unsigned int count = (end - addr) >> PAGE_SHIFT;
	pte_t ptent = ptep_get(pte);

	if (!folio_test_large(folio))
		return 1;

	return folio_pte_batch(folio, addr, pte, ptent, count, fpb_flags, NULL,
			       NULL, NULL);
}
static inline bool allow_mlock_munlock(struct folio *folio,
		struct vm_area_struct *vma, unsigned long start,
		unsigned long end, unsigned int step)
{
	/*
	 * For munlock, allow munlocking a large folio that is only partially
	 * mapped to the VMA, since the folio may have been mlocked first and
	 * the VMA split afterwards.
	 *
	 * Under memory pressure such a large folio can be split, and the
	 * pages that no longer sit in a VM_LOCKED VMA can then be reclaimed.
	 */
	if (!(vma->vm_flags & VM_LOCKED))
		return true;

	/* folio_within_range() cannot take KSM, but any small folio is OK */
	if (!folio_test_large(folio))
		return true;

	/* folio not in range [start, end), skip mlock */
	if (!folio_within_range(folio, vma, start, end))
		return false;

	/* folio is not fully mapped, skip mlock */
	if (step != folio_nr_pages(folio))
		return false;

	return true;
}
static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
			   unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *start_pte, *pte;
	pte_t ptent;
	struct folio *folio;
	unsigned int step = 1;
	unsigned long start = addr;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		if (!pmd_present(*pmd))
			goto out;
		if (is_huge_zero_pmd(*pmd))
			goto out;
		folio = pmd_folio(*pmd);
		if (vma->vm_flags & VM_LOCKED)
			mlock_folio(folio);
		else
			munlock_folio(folio);
		goto out;
	}

	start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!start_pte) {
		walk->action = ACTION_AGAIN;
		return 0;
	}

	for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = ptep_get(pte);
		if (!pte_present(ptent))
			continue;
		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		step = folio_mlock_step(folio, pte, addr, end);
		if (!allow_mlock_munlock(folio, vma, start, end, step))
			goto next_entry;

		if (vma->vm_flags & VM_LOCKED)
			mlock_folio(folio);
		else
			munlock_folio(folio);

next_entry:
		pte += step - 1;
		addr += (step - 1) << PAGE_SHIFT;
	}
	pte_unmap(start_pte);
out:
	spin_unlock(ptl);
	cond_resched();
	return 0;
}
/*
 * mlock_vma_pages_range() - mlock any pages already in the range,
 *                           or munlock all pages in the range.
 * @vma - vma containing range to be mlock()ed or munlock()ed
 * @start - start address in @vma of the range
 * @end - end of range in @vma
 * @newflags - the new set of flags for @vma.
 *
 * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED;
 * called for munlock() and munlockall(), to clear VM_LOCKED from @vma.
 */
static void mlock_vma_pages_range(struct vm_area_struct *vma,
	unsigned long start, unsigned long end, vm_flags_t newflags)
{
	static const struct mm_walk_ops mlock_walk_ops = {
		.pmd_entry = mlock_pte_range,
		.walk_lock = PGWALK_WRLOCK_VERIFY,
	};

	/*
	 * There is a slight chance that concurrent page migration,
	 * or page reclaim finding a page of this now-VM_LOCKED vma,
	 * will call mlock_vma_folio() and raise page's mlock_count:
	 * double counting, leaving the page unevictable indefinitely.
	 * Communicate this danger to mlock_vma_folio() with VM_IO,
	 * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas.
	 * mmap_lock is held in write mode here, so this weird
	 * combination should not be visible to other mmap_lock users;
	 * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED.
	 */
	if (newflags & VM_LOCKED)
		newflags |= VM_IO;
	vma_start_write(vma);
	vm_flags_reset_once(vma, newflags);

	lru_add_drain();
	walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL);
	lru_add_drain();

	if (newflags & VM_IO) {
		newflags &= ~VM_IO;
		vm_flags_reset_once(vma, newflags);
	}
}
/*
 * mlock_fixup - handle mlock[all]/munlock[all] requests.
 *
 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
 * munlock is a no-op. However, for some special vmas, we go ahead and
 * populate the ptes.
 *
 * For vmas that pass the filters, merge/split as appropriate.
 */
static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
	       struct vm_area_struct **prev, unsigned long start,
	       unsigned long end, vm_flags_t newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	int nr_pages;
	int ret = 0;
	vm_flags_t oldflags = vma->vm_flags;

	if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
	    is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
	    vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE))
		/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
		goto out;

	vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	/*
	 * Keep track of amount of locked VM.
	 */
	nr_pages = (end - start) >> PAGE_SHIFT;
	if (!(newflags & VM_LOCKED))
		nr_pages = -nr_pages;
	else if (oldflags & VM_LOCKED)
		nr_pages = 0;
	mm->locked_vm += nr_pages;

	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 * It's okay if try_to_unmap_one unmaps a page just after we
	 * set VM_LOCKED, populate_vma_page_range will bring it back.
	 */
	if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
		/* No work to do, and mlocking twice would be wrong */
		vma_start_write(vma);
		vm_flags_reset(vma, newflags);
	} else {
		mlock_vma_pages_range(vma, start, end, newflags);
	}
out:
	*prev = vma;
	return ret;
}
static int apply_vma_lock_flags(unsigned long start, size_t len,
				vm_flags_t flags)
{
	unsigned long nstart, end, tmp;
	struct vm_area_struct *vma, *prev;
	VMA_ITERATOR(vmi, current->mm, start);

	VM_BUG_ON(offset_in_page(start));
	VM_BUG_ON(len != PAGE_ALIGN(len));
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	vma = vma_iter_load(&vmi);
	if (!vma)
		return -ENOMEM;

	prev = vma_prev(&vmi);
	if (start > vma->vm_start)
		prev = vma;

	nstart = start;
	tmp = vma->vm_start;
	for_each_vma_range(vmi, vma, end) {
		int error;
		vm_flags_t newflags;

		if (vma->vm_start != tmp)
			return -ENOMEM;

		newflags = vma->vm_flags & ~VM_LOCKED_MASK;
		newflags |= flags;
		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
		if (error)
			return error;
		tmp = vma_iter_end(&vmi);
		nstart = tmp;
	}

	if (tmp < end)
		return -ENOMEM;

	return 0;
}
/*
 * Go through the VMAs overlapping [start, start + len) and return how many
 * pages in that range are already covered by VM_LOCKED vmas.
 * Note that the deferred memory locking case (mlock2(,,MLOCK_ONFAULT))
 * is also counted.
 * Return value: previously mlocked page count.
 */
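/*
 * Worked example (illustrative numbers, not from the original source): with
 * a single VM_LOCKED vma covering [0x1000, 0x5000) and a request of
 * start = 0x2000, len = 0x7000 (so end = 0x9000), the loop below subtracts
 * the 0x1000 bytes of the vma that lie before start and then adds the whole
 * vma, giving (0x4000 - 0x1000) >> PAGE_SHIFT = 3 pages (with 4 KiB pages)
 * already locked within the requested range.
 */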
static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
		unsigned long start, size_t len)
{
	struct vm_area_struct *vma;
	unsigned long count = 0;
	unsigned long end;
	VMA_ITERATOR(vmi, mm, start);

	/* Don't overflow past ULONG_MAX */
	if (unlikely(ULONG_MAX - len < start))
		end = ULONG_MAX;
	else
		end = start + len;

	for_each_vma_range(vmi, vma, end) {
		if (vma->vm_flags & VM_LOCKED) {
			if (start > vma->vm_start)
				count -= (start - vma->vm_start);
			if (end < vma->vm_end) {
				count += end - vma->vm_start;
				break;
			}
			count += vma->vm_end - vma->vm_start;
		}
	}

	return count >> PAGE_SHIFT;
}
/*
 * convert get_user_pages() return value to posix mlock() error
 */
static int __mlock_posix_error_return(long retval)
{
	if (retval == -EFAULT)
		retval = -ENOMEM;
	else if (retval == -ENOMEM)
		retval = -EAGAIN;
	return retval;
}
static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
	unsigned long locked;
	unsigned long lock_limit;
	int error = -ENOMEM;

	start = untagged_addr(start);

	if (!can_do_mlock())
		return -EPERM;

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;
	locked = len >> PAGE_SHIFT;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	locked += current->mm->locked_vm;
	if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
		/*
		 * The requested region may intersect areas that are already
		 * mlocked; that part is already accounted in mm->locked_vm
		 * and must not be counted again towards the new mlock
		 * increment. Check and adjust the locked count if necessary.
		 */
		locked -= count_mm_mlocked_page_nr(current->mm,
				start, len);
	}
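
	/*
	 * For example (illustrative numbers, not from the original source):
	 * with a lock_limit of 16 pages, locked_vm already at 10 pages, and
	 * an 8-page request of which 4 pages overlap an existing VM_LOCKED
	 * vma, the check below compares 10 + 8 - 4 = 14 pages against the
	 * limit and allows the request.
	 */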
	/* check against resource limits */
	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
		error = apply_vma_lock_flags(start, len, flags);

	mmap_write_unlock(current->mm);
	if (error)
		return error;

	error = __mm_populate(start, len, 0);
	if (error)
		return __mlock_posix_error_return(error);
	return 0;
}
SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
	return do_mlock(start, len, VM_LOCKED);
}
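
/*
 * Illustrative userspace usage (example only, not part of this file):
 * mlock() pins a buffer so it cannot be paged out, e.g.
 *
 *	void *buf = NULL;
 *	if (posix_memalign(&buf, 4096, 4096) == 0 &&
 *	    mlock(buf, 4096) == 0) {
 *		... use buf without risk of major page faults ...
 *		munlock(buf, 4096);
 *	}
 *	free(buf);
 */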
SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
{
	vm_flags_t vm_flags = VM_LOCKED;

	if (flags & ~MLOCK_ONFAULT)
		return -EINVAL;

	if (flags & MLOCK_ONFAULT)
		vm_flags |= VM_LOCKONFAULT;

	return do_mlock(start, len, vm_flags);
}
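
/*
 * Illustrative userspace usage (example only, not part of this file):
 * glibc (>= 2.27) exposes this syscall as mlock2(); with MLOCK_ONFAULT the
 * pages are locked only as they are faulted in rather than being populated
 * up front, e.g.
 *
 *	mlock2(addr, len, MLOCK_ONFAULT);
 */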
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
	int ret;

	start = untagged_addr(start);

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	ret = apply_vma_lock_flags(start, len, 0);
	mmap_write_unlock(current->mm);

	return ret;
}
/*
 * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
 * and translate into the appropriate modifications to mm->def_flags and/or the
 * flags for all current VMAs.
 *
 * There are a couple of subtleties with this. If mlockall() is called multiple
 * times with different flags, the values do not necessarily stack. If mlockall
 * is called once including the MCL_FUTURE flag and then a second time without
 * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
 */
static int apply_mlockall_flags(int flags)
{
	VMA_ITERATOR(vmi, current->mm, 0);
	struct vm_area_struct *vma, *prev = NULL;
	vm_flags_t to_add = 0;

	current->mm->def_flags &= ~VM_LOCKED_MASK;
	if (flags & MCL_FUTURE) {
		current->mm->def_flags |= VM_LOCKED;

		if (flags & MCL_ONFAULT)
			current->mm->def_flags |= VM_LOCKONFAULT;

		if (!(flags & MCL_CURRENT))
			goto out;
	}

	if (flags & MCL_CURRENT) {
		to_add |= VM_LOCKED;
		if (flags & MCL_ONFAULT)
			to_add |= VM_LOCKONFAULT;
	}

	for_each_vma(vmi, vma) {
		int error;
		vm_flags_t newflags;

		newflags = vma->vm_flags & ~VM_LOCKED_MASK;
		newflags |= to_add;

		error = mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end,
				    newflags);
		/* Ignore errors, but prev needs fixing up. */
		if (error)
			prev = vma;
		cond_resched();
	}
out:
	return 0;
}
SYSCALL_DEFINE1(mlockall, int, flags)
{
	unsigned long lock_limit;
	int ret;

	if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) ||
	    flags == MCL_ONFAULT)
		return -EINVAL;

	if (!can_do_mlock())
		return -EPERM;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	ret = -ENOMEM;
	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
	    capable(CAP_IPC_LOCK))
		ret = apply_mlockall_flags(flags);
	mmap_write_unlock(current->mm);
	if (!ret && (flags & MCL_CURRENT))
		mm_populate(0, TASK_SIZE);

	return ret;
}
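
/*
 * Illustrative userspace usage (example only, not part of this file):
 * latency-sensitive programs commonly lock their whole address space, both
 * current and future mappings, e.g.
 *
 *	if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0)
 *		perror("mlockall");
 */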
SYSCALL_DEFINE0(munlockall)
{
	int ret;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	ret = apply_mlockall_flags(0);
	mmap_write_unlock(current->mm);
	return ret;
}
/*
 * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
 * shm segments) get accounted against the user_struct instead.
 */
static DEFINE_SPINLOCK(shmlock_user_lock);

int user_shm_lock(size_t size, struct ucounts *ucounts)
{
	unsigned long lock_limit, locked;
	long memlock;
	int allowed = 0;

	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	lock_limit = rlimit(RLIMIT_MEMLOCK);
	if (lock_limit != RLIM_INFINITY)
		lock_limit >>= PAGE_SHIFT;
	spin_lock(&shmlock_user_lock);
	memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);

	if ((memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
		goto out;
	}
	if (!get_ucounts(ucounts)) {
		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
		allowed = 0;
		goto out;
	}
	allowed = 1;
out:
	spin_unlock(&shmlock_user_lock);
	return allowed;
}
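
/*
 * Illustrative userspace path (example only, not part of this file): this
 * accounting backs locking of SysV shared memory segments, typically
 * reached via e.g.
 *
 *	int id = shmget(IPC_PRIVATE, size, IPC_CREAT | 0600);
 *	shmctl(id, SHM_LOCK, NULL);
 */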
void user_shm_unlock(size_t size, struct ucounts *ucounts)
{
	spin_lock(&shmlock_user_lock);
	dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
	spin_unlock(&shmlock_user_lock);
	put_ucounts(ucounts);
}