  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * mm/mmap.c
  4. *
  5. * Written by obz.
  6. *
  7. * Address space accounting code <alan@lxorguk.ukuu.org.uk>
  8. */
  9. #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  10. #include <linux/kernel.h>
  11. #include <linux/slab.h>
  12. #include <linux/backing-dev.h>
  13. #include <linux/mm.h>
  14. #include <linux/mm_inline.h>
  15. #include <linux/shm.h>
  16. #include <linux/mman.h>
  17. #include <linux/pagemap.h>
  18. #include <linux/swap.h>
  19. #include <linux/syscalls.h>
  20. #include <linux/capability.h>
  21. #include <linux/init.h>
  22. #include <linux/file.h>
  23. #include <linux/fs.h>
  24. #include <linux/personality.h>
  25. #include <linux/security.h>
  26. #include <linux/hugetlb.h>
  27. #include <linux/shmem_fs.h>
  28. #include <linux/profile.h>
  29. #include <linux/export.h>
  30. #include <linux/mount.h>
  31. #include <linux/mempolicy.h>
  32. #include <linux/rmap.h>
  33. #include <linux/mmu_notifier.h>
  34. #include <linux/mmdebug.h>
  35. #include <linux/perf_event.h>
  36. #include <linux/audit.h>
  37. #include <linux/khugepaged.h>
  38. #include <linux/uprobes.h>
  39. #include <linux/notifier.h>
  40. #include <linux/memory.h>
  41. #include <linux/printk.h>
  42. #include <linux/userfaultfd_k.h>
  43. #include <linux/moduleparam.h>
  44. #include <linux/pkeys.h>
  45. #include <linux/oom.h>
  46. #include <linux/sched/mm.h>
  47. #include <linux/ksm.h>
  48. #include <linux/memfd.h>
  49. #include <linux/uaccess.h>
  50. #include <asm/cacheflush.h>
  51. #include <asm/tlb.h>
  52. #include <asm/mmu_context.h>
  53. #define CREATE_TRACE_POINTS
  54. #include <trace/events/mmap.h>
  55. #include "internal.h"
  56. #ifndef arch_mmap_check
  57. #define arch_mmap_check(addr, len, flags) (0)
  58. #endif
  59. #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
  60. const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
  61. int mmap_rnd_bits_max __ro_after_init = CONFIG_ARCH_MMAP_RND_BITS_MAX;
  62. int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
  63. #endif
  64. #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
  65. const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
  66. const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
  67. int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
  68. #endif
  69. static bool ignore_rlimit_data;
  70. core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
  71. /* Update vma->vm_page_prot to reflect vma->vm_flags. */
  72. void vma_set_page_prot(struct vm_area_struct *vma)
  73. {
  74. unsigned long vm_flags = vma->vm_flags;
  75. pgprot_t vm_page_prot;
  76. vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
  77. if (vma_wants_writenotify(vma, vm_page_prot)) {
  78. vm_flags &= ~VM_SHARED;
  79. vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
  80. }
  81. /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
  82. WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
  83. }
  84. /*
  85. * check_brk_limits() - Use the platform-specific range check and verify
  86. * mlock limits.
  87. * @addr: The address to check
  88. * @len: The size of increase.
  89. *
  90. * Return: 0 on success.
  91. */
  92. static int check_brk_limits(unsigned long addr, unsigned long len)
  93. {
  94. unsigned long mapped_addr;
  95. mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
  96. if (IS_ERR_VALUE(mapped_addr))
  97. return mapped_addr;
  98. return mlock_future_ok(current->mm, current->mm->def_flags, len)
  99. ? 0 : -EAGAIN;
  100. }
  101. static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma,
  102. unsigned long addr, unsigned long request, unsigned long flags);
  103. SYSCALL_DEFINE1(brk, unsigned long, brk)
  104. {
  105. unsigned long newbrk, oldbrk, origbrk;
  106. struct mm_struct *mm = current->mm;
  107. struct vm_area_struct *brkvma, *next = NULL;
  108. unsigned long min_brk;
  109. bool populate = false;
  110. LIST_HEAD(uf);
  111. struct vma_iterator vmi;
  112. if (mmap_write_lock_killable(mm))
  113. return -EINTR;
  114. origbrk = mm->brk;
  115. #ifdef CONFIG_COMPAT_BRK
  116. /*
  117. * CONFIG_COMPAT_BRK can still be overridden by setting
  118. * randomize_va_space to 2, which will still cause mm->start_brk
  119. * to be arbitrarily shifted
  120. */
  121. if (current->brk_randomized)
  122. min_brk = mm->start_brk;
  123. else
  124. min_brk = mm->end_data;
  125. #else
  126. min_brk = mm->start_brk;
  127. #endif
  128. if (brk < min_brk)
  129. goto out;
  130. /*
  131. * Check against rlimit here. If this check is done later after the test
  132. * of oldbrk with newbrk then it can escape the test and let the data
  133. segment grow beyond its set limit in the case where the limit is
  134. * not page aligned -Ram Gupta
  135. */
  136. if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
  137. mm->end_data, mm->start_data))
  138. goto out;
  139. newbrk = PAGE_ALIGN(brk);
  140. oldbrk = PAGE_ALIGN(mm->brk);
  141. if (oldbrk == newbrk) {
  142. mm->brk = brk;
  143. goto success;
  144. }
  145. /* Always allow shrinking brk. */
  146. if (brk <= mm->brk) {
  147. /* Search one past newbrk */
  148. vma_iter_init(&vmi, mm, newbrk);
  149. brkvma = vma_find(&vmi, oldbrk);
  150. if (!brkvma || brkvma->vm_start >= oldbrk)
  151. goto out; /* mapping intersects with an existing non-brk vma. */
  152. /*
  153. * mm->brk must be protected by write mmap_lock.
  154. * do_vmi_align_munmap() will drop the lock on success, so
  155. * update it before calling do_vma_munmap().
  156. */
  157. mm->brk = brk;
  158. if (do_vmi_align_munmap(&vmi, brkvma, mm, newbrk, oldbrk, &uf,
  159. /* unlock = */ true))
  160. goto out;
  161. goto success_unlocked;
  162. }
  163. if (check_brk_limits(oldbrk, newbrk - oldbrk))
  164. goto out;
  165. /*
  166. * Only check if the next VMA is within the stack_guard_gap of the
  167. * expansion area
  168. */
  169. vma_iter_init(&vmi, mm, oldbrk);
  170. next = vma_find(&vmi, newbrk + PAGE_SIZE + stack_guard_gap);
  171. if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
  172. goto out;
  173. brkvma = vma_prev_limit(&vmi, mm->start_brk);
  174. /* Ok, looks good - let it rip. */
  175. if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
  176. goto out;
  177. mm->brk = brk;
  178. if (mm->def_flags & VM_LOCKED)
  179. populate = true;
  180. success:
  181. mmap_write_unlock(mm);
  182. success_unlocked:
  183. userfaultfd_unmap_complete(mm, &uf);
  184. if (populate)
  185. mm_populate(oldbrk, newbrk - oldbrk);
  186. return brk;
  187. out:
  188. mm->brk = origbrk;
  189. mmap_write_unlock(mm);
  190. return origbrk;
  191. }
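/*
 * Usage note (not kernel code): per the paths above, sys_brk() returns the
 * resulting break on success and the unchanged original break on failure,
 * rather than a negative errno, so userspace wrappers typically compare the
 * returned value against the requested address. A minimal, hypothetical
 * userspace sketch (assuming 4 KiB pages):
 *
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int grow_heap_by_one_page(void)
 *	{
 *		void *cur = sbrk(0);			// query current break
 *		if (brk((char *)cur + 4096) != 0) {	// request one more page
 *			perror("brk");			// wrapper saw break < request
 *			return -1;
 *		}
 *		return 0;
 *	}
 */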
  192. /*
  193. * If a hint addr is less than mmap_min_addr change hint to be as
  194. * low as possible but still greater than mmap_min_addr
  195. */
  196. static inline unsigned long round_hint_to_min(unsigned long hint)
  197. {
  198. hint &= PAGE_MASK;
  199. if (((void *)hint != NULL) &&
  200. (hint < mmap_min_addr))
  201. return PAGE_ALIGN(mmap_min_addr);
  202. return hint;
  203. }
  204. bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
  205. unsigned long bytes)
  206. {
  207. unsigned long locked_pages, limit_pages;
  208. if (!(flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
  209. return true;
  210. locked_pages = bytes >> PAGE_SHIFT;
  211. locked_pages += mm->locked_vm;
  212. limit_pages = rlimit(RLIMIT_MEMLOCK);
  213. limit_pages >>= PAGE_SHIFT;
  214. return locked_pages <= limit_pages;
  215. }
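/*
 * Worked example (illustrative numbers, not from this file): with 4 KiB pages
 * and RLIMIT_MEMLOCK = 64 KiB, limit_pages = 16. If mm->locked_vm already
 * holds 10 pages and a VM_LOCKED request asks for bytes = 8 << PAGE_SHIFT,
 * then locked_pages = 8 + 10 = 18 > 16, so mlock_future_ok() returns false
 * and the caller fails the mapping (e.g. -EAGAIN in check_brk_limits() above).
 */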
  216. static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
  217. {
  218. if (S_ISREG(inode->i_mode))
  219. return MAX_LFS_FILESIZE;
  220. if (S_ISBLK(inode->i_mode))
  221. return MAX_LFS_FILESIZE;
  222. if (S_ISSOCK(inode->i_mode))
  223. return MAX_LFS_FILESIZE;
  224. /* Special "we do even unsigned file positions" case */
  225. if (file->f_op->fop_flags & FOP_UNSIGNED_OFFSET)
  226. return 0;
  227. /* Yes, random drivers might want more. But I'm tired of buggy drivers */
  228. return ULONG_MAX;
  229. }
  230. static inline bool file_mmap_ok(struct file *file, struct inode *inode,
  231. unsigned long pgoff, unsigned long len)
  232. {
  233. u64 maxsize = file_mmap_size_max(file, inode);
  234. if (maxsize && len > maxsize)
  235. return false;
  236. maxsize -= len;
  237. if (pgoff > maxsize >> PAGE_SHIFT)
  238. return false;
  239. return true;
  240. }
  241. /*
  242. * The caller must write-lock current->mm->mmap_lock.
  243. */
  244. unsigned long do_mmap(struct file *file, unsigned long addr,
  245. unsigned long len, unsigned long prot,
  246. unsigned long flags, vm_flags_t vm_flags,
  247. unsigned long pgoff, unsigned long *populate,
  248. struct list_head *uf)
  249. {
  250. struct mm_struct *mm = current->mm;
  251. int pkey = 0;
  252. *populate = 0;
  253. if (!len)
  254. return -EINVAL;
  255. /*
  256. * Does the application expect PROT_READ to imply PROT_EXEC?
  257. *
  258. * (the exception is when the underlying filesystem is noexec
  259. * mounted, in which case we don't add PROT_EXEC.)
  260. */
  261. if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
  262. if (!(file && path_noexec(&file->f_path)))
  263. prot |= PROT_EXEC;
  264. /* force arch specific MAP_FIXED handling in get_unmapped_area */
  265. if (flags & MAP_FIXED_NOREPLACE)
  266. flags |= MAP_FIXED;
  267. if (!(flags & MAP_FIXED))
  268. addr = round_hint_to_min(addr);
  269. /* Careful about overflows.. */
  270. len = PAGE_ALIGN(len);
  271. if (!len)
  272. return -ENOMEM;
  273. /* offset overflow? */
  274. if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
  275. return -EOVERFLOW;
  276. /* Too many mappings? */
  277. if (mm->map_count > sysctl_max_map_count)
  278. return -ENOMEM;
  279. /*
  280. * addr is returned from get_unmapped_area.
  281. * There are two cases:
  282. * 1> MAP_FIXED == false
  283. * unallocated memory, no need to check sealing.
  284. * 2> MAP_FIXED == true
  285. * sealing is checked inside mmap_region when
  286. * do_vmi_munmap is called.
  287. */
  288. if (prot == PROT_EXEC) {
  289. pkey = execute_only_pkey(mm);
  290. if (pkey < 0)
  291. pkey = 0;
  292. }
  293. /* Do simple checking here so the lower-level routines won't have
  294. * to. We assume access permissions have been handled by the open
  295. * of the memory object, so we don't do any here.
  296. */
  297. vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(file, flags) |
  298. mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
  299. /* Obtain the address to map to. We verify (or select) it and ensure
  300. * that it represents a valid section of the address space.
  301. */
  302. addr = __get_unmapped_area(file, addr, len, pgoff, flags, vm_flags);
  303. if (IS_ERR_VALUE(addr))
  304. return addr;
  305. if (flags & MAP_FIXED_NOREPLACE) {
  306. if (find_vma_intersection(mm, addr, addr + len))
  307. return -EEXIST;
  308. }
  309. if (flags & MAP_LOCKED)
  310. if (!can_do_mlock())
  311. return -EPERM;
  312. if (!mlock_future_ok(mm, vm_flags, len))
  313. return -EAGAIN;
  314. if (file) {
  315. struct inode *inode = file_inode(file);
  316. unsigned int seals = memfd_file_seals(file);
  317. unsigned long flags_mask;
  318. if (!file_mmap_ok(file, inode, pgoff, len))
  319. return -EOVERFLOW;
  320. flags_mask = LEGACY_MAP_MASK;
  321. if (file->f_op->fop_flags & FOP_MMAP_SYNC)
  322. flags_mask |= MAP_SYNC;
  323. switch (flags & MAP_TYPE) {
  324. case MAP_SHARED:
  325. /*
  326. * Force use of MAP_SHARED_VALIDATE with non-legacy
  327. * flags. E.g. MAP_SYNC is dangerous to use with
  328. * MAP_SHARED as you don't know which consistency model
  329. * you will get. We silently ignore unsupported flags
  330. * with MAP_SHARED to preserve backward compatibility.
  331. */
  332. flags &= LEGACY_MAP_MASK;
  333. fallthrough;
  334. case MAP_SHARED_VALIDATE:
  335. if (flags & ~flags_mask)
  336. return -EOPNOTSUPP;
  337. if (prot & PROT_WRITE) {
  338. if (!(file->f_mode & FMODE_WRITE))
  339. return -EACCES;
  340. if (IS_SWAPFILE(file->f_mapping->host))
  341. return -ETXTBSY;
  342. }
  343. /*
  344. * Make sure we don't allow writing to an append-only
  345. * file..
  346. */
  347. if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
  348. return -EACCES;
  349. vm_flags |= VM_SHARED | VM_MAYSHARE;
  350. if (!(file->f_mode & FMODE_WRITE))
  351. vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
  352. else if (is_readonly_sealed(seals, vm_flags))
  353. vm_flags &= ~VM_MAYWRITE;
  354. fallthrough;
  355. case MAP_PRIVATE:
  356. if (!(file->f_mode & FMODE_READ))
  357. return -EACCES;
  358. if (path_noexec(&file->f_path)) {
  359. if (vm_flags & VM_EXEC)
  360. return -EPERM;
  361. vm_flags &= ~VM_MAYEXEC;
  362. }
  363. if (!file->f_op->mmap)
  364. return -ENODEV;
  365. if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
  366. return -EINVAL;
  367. break;
  368. default:
  369. return -EINVAL;
  370. }
  371. } else {
  372. switch (flags & MAP_TYPE) {
  373. case MAP_SHARED:
  374. if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
  375. return -EINVAL;
  376. /*
  377. * Ignore pgoff.
  378. */
  379. pgoff = 0;
  380. vm_flags |= VM_SHARED | VM_MAYSHARE;
  381. break;
  382. case MAP_DROPPABLE:
  383. if (VM_DROPPABLE == VM_NONE)
  384. return -ENOTSUPP;
  385. /*
  386. * A locked or stack area makes no sense to be droppable.
  387. *
  388. * Also, since droppable pages can just go away at any time
  389. * it makes no sense to copy them on fork or dump them.
  390. *
  391. * And don't attempt to combine with hugetlb for now.
  392. */
  393. if (flags & (MAP_LOCKED | MAP_HUGETLB))
  394. return -EINVAL;
  395. if (vm_flags & (VM_GROWSDOWN | VM_GROWSUP))
  396. return -EINVAL;
  397. vm_flags |= VM_DROPPABLE;
  398. /*
  399. * If the pages can be dropped, then it doesn't make
  400. * sense to reserve them.
  401. */
  402. vm_flags |= VM_NORESERVE;
  403. /*
  404. * Likewise, they're volatile enough that they
  405. * shouldn't survive forks or coredumps.
  406. */
  407. vm_flags |= VM_WIPEONFORK | VM_DONTDUMP;
  408. fallthrough;
  409. case MAP_PRIVATE:
  410. /*
  411. * Set pgoff according to addr for anon_vma.
  412. */
  413. pgoff = addr >> PAGE_SHIFT;
  414. break;
  415. default:
  416. return -EINVAL;
  417. }
  418. }
  419. /*
  420. * Set 'VM_NORESERVE' if we should not account for the
  421. * memory use of this mapping.
  422. */
  423. if (flags & MAP_NORESERVE) {
  424. /* We honor MAP_NORESERVE if allowed to overcommit */
  425. if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
  426. vm_flags |= VM_NORESERVE;
  427. /* hugetlb applies strict overcommit unless MAP_NORESERVE */
  428. if (file && is_file_hugepages(file))
  429. vm_flags |= VM_NORESERVE;
  430. }
  431. addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
  432. if (!IS_ERR_VALUE(addr) &&
  433. ((vm_flags & VM_LOCKED) ||
  434. (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
  435. *populate = len;
  436. return addr;
  437. }
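/*
 * Usage note (not kernel code): per the check above, *populate is set to the
 * mapping length when VM_LOCKED is requested or when MAP_POPULATE is passed
 * without MAP_NONBLOCK, so that the caller can prefault the range. A
 * hypothetical userspace sketch that takes this path:
 *
 *	#include <sys/mman.h>
 *
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
 *	// On success the pages are faulted in up front instead of lazily.
 */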
  438. unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
  439. unsigned long prot, unsigned long flags,
  440. unsigned long fd, unsigned long pgoff)
  441. {
  442. struct file *file = NULL;
  443. unsigned long retval;
  444. if (!(flags & MAP_ANONYMOUS)) {
  445. audit_mmap_fd(fd, flags);
  446. file = fget(fd);
  447. if (!file)
  448. return -EBADF;
  449. if (is_file_hugepages(file)) {
  450. len = ALIGN(len, huge_page_size(hstate_file(file)));
  451. } else if (unlikely(flags & MAP_HUGETLB)) {
  452. retval = -EINVAL;
  453. goto out_fput;
  454. }
  455. } else if (flags & MAP_HUGETLB) {
  456. struct hstate *hs;
  457. hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
  458. if (!hs)
  459. return -EINVAL;
  460. len = ALIGN(len, huge_page_size(hs));
  461. /*
  462. * VM_NORESERVE is used because the reservations will be
  463. * taken when vm_ops->mmap() is called
  464. */
  465. file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
  466. VM_NORESERVE,
  467. HUGETLB_ANONHUGE_INODE,
  468. (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
  469. if (IS_ERR(file))
  470. return PTR_ERR(file);
  471. }
  472. retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
  473. out_fput:
  474. if (file)
  475. fput(file);
  476. return retval;
  477. }
  478. SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
  479. unsigned long, prot, unsigned long, flags,
  480. unsigned long, fd, unsigned long, pgoff)
  481. {
  482. return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
  483. }
  484. #ifdef __ARCH_WANT_SYS_OLD_MMAP
  485. struct mmap_arg_struct {
  486. unsigned long addr;
  487. unsigned long len;
  488. unsigned long prot;
  489. unsigned long flags;
  490. unsigned long fd;
  491. unsigned long offset;
  492. };
  493. SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
  494. {
  495. struct mmap_arg_struct a;
  496. if (copy_from_user(&a, arg, sizeof(a)))
  497. return -EFAULT;
  498. if (offset_in_page(a.offset))
  499. return -EINVAL;
  500. return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
  501. a.offset >> PAGE_SHIFT);
  502. }
  503. #endif /* __ARCH_WANT_SYS_OLD_MMAP */
  504. /*
  505. * We account for memory if it's a private writeable mapping,
  506. * not hugepages and VM_NORESERVE wasn't set.
  507. */
  508. static inline bool accountable_mapping(struct file *file, vm_flags_t vm_flags)
  509. {
  510. /*
  511. * hugetlb has its own accounting separate from the core VM
  512. * VM_HUGETLB may not be set yet so we cannot check for that flag.
  513. */
  514. if (file && is_file_hugepages(file))
  515. return false;
  516. return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
  517. }
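/*
 * Worked example (illustrative): a MAP_PRIVATE | PROT_WRITE anonymous mapping
 * without MAP_NORESERVE has VM_WRITE set but neither VM_SHARED nor
 * VM_NORESERVE, so the test above is true and the mapping is charged against
 * the overcommit accounting. A MAP_SHARED mapping, or one created with
 * MAP_NORESERVE while overcommit permits it, is not charged here.
 */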
  518. /**
  519. * unmapped_area() - Find an area between the low_limit and the high_limit with
  520. * the correct alignment and offset, all from @info. Note: current->mm is used
  521. * for the search.
  522. *
  523. * @info: The unmapped area information including the range [low_limit,
  524. * high_limit), the alignment offset and mask.
  525. *
  526. * Return: A memory address or -ENOMEM.
  527. */
  528. static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
  529. {
  530. unsigned long length, gap;
  531. unsigned long low_limit, high_limit;
  532. struct vm_area_struct *tmp;
  533. VMA_ITERATOR(vmi, current->mm, 0);
  534. /* Adjust search length to account for worst case alignment overhead */
  535. length = info->length + info->align_mask + info->start_gap;
  536. if (length < info->length)
  537. return -ENOMEM;
  538. low_limit = info->low_limit;
  539. if (low_limit < mmap_min_addr)
  540. low_limit = mmap_min_addr;
  541. high_limit = info->high_limit;
  542. retry:
  543. if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length))
  544. return -ENOMEM;
  545. /*
  546. * Adjust for the gap first so it doesn't interfere with the
  547. * later alignment. The first step is the minimum needed to
  548. * fulfill the start gap, the next step is the minimum to align
  549. * that. It is the minimum needed to fulfill both.
  550. */
  551. gap = vma_iter_addr(&vmi) + info->start_gap;
  552. gap += (info->align_offset - gap) & info->align_mask;
  553. tmp = vma_next(&vmi);
  554. if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
  555. if (vm_start_gap(tmp) < gap + length - 1) {
  556. low_limit = tmp->vm_end;
  557. vma_iter_reset(&vmi);
  558. goto retry;
  559. }
  560. } else {
  561. tmp = vma_prev(&vmi);
  562. if (tmp && vm_end_gap(tmp) > gap) {
  563. low_limit = vm_end_gap(tmp);
  564. vma_iter_reset(&vmi);
  565. goto retry;
  566. }
  567. }
  568. return gap;
  569. }
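/*
 * Worked example of the alignment step above (illustrative numbers): suppose
 * the lowest suitable gap starts at 0x7f0000001000, info->start_gap is 0, and
 * the caller asked for 64 KiB alignment with align_mask = 0xffff and
 * align_offset = 0x3000. Then gap = 0x7f0000001000 and
 * (align_offset - gap) & align_mask = (0x3000 - 0x1000) & 0xffff = 0x2000,
 * so gap becomes 0x7f0000003000, which satisfies
 * (addr & align_mask) == (align_offset & align_mask).
 */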
  570. /**
  571. * unmapped_area_topdown() - Find an area between the low_limit and the
  572. * high_limit with the correct alignment and offset at the highest available
  573. * address, all from @info. Note: current->mm is used for the search.
  574. *
  575. * @info: The unmapped area information including the range [low_limit,
  576. * high_limit), the alignment offset and mask.
  577. *
  578. * Return: A memory address or -ENOMEM.
  579. */
  580. static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
  581. {
  582. unsigned long length, gap, gap_end;
  583. unsigned long low_limit, high_limit;
  584. struct vm_area_struct *tmp;
  585. VMA_ITERATOR(vmi, current->mm, 0);
  586. /* Adjust search length to account for worst case alignment overhead */
  587. length = info->length + info->align_mask + info->start_gap;
  588. if (length < info->length)
  589. return -ENOMEM;
  590. low_limit = info->low_limit;
  591. if (low_limit < mmap_min_addr)
  592. low_limit = mmap_min_addr;
  593. high_limit = info->high_limit;
  594. retry:
  595. if (vma_iter_area_highest(&vmi, low_limit, high_limit, length))
  596. return -ENOMEM;
  597. gap = vma_iter_end(&vmi) - info->length;
  598. gap -= (gap - info->align_offset) & info->align_mask;
  599. gap_end = vma_iter_end(&vmi);
  600. tmp = vma_next(&vmi);
  601. if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
  602. if (vm_start_gap(tmp) < gap_end) {
  603. high_limit = vm_start_gap(tmp);
  604. vma_iter_reset(&vmi);
  605. goto retry;
  606. }
  607. } else {
  608. tmp = vma_prev(&vmi);
  609. if (tmp && vm_end_gap(tmp) > gap) {
  610. high_limit = tmp->vm_start;
  611. vma_iter_reset(&vmi);
  612. goto retry;
  613. }
  614. }
  615. return gap;
  616. }
  617. /*
  618. * Determine if the allocation needs to ensure that there is no
  619. * existing mapping within its guard gaps, for use as start_gap.
  620. */
  621. static inline unsigned long stack_guard_placement(vm_flags_t vm_flags)
  622. {
  623. if (vm_flags & VM_SHADOW_STACK)
  624. return PAGE_SIZE;
  625. return 0;
  626. }
  627. /*
  628. * Search for an unmapped address range.
  629. *
  630. * We are looking for a range that:
  631. * - does not intersect with any VMA;
  632. * - is contained within the [low_limit, high_limit) interval;
  633. * - is at least the desired size.
  634. * - satisfies (begin_addr & align_mask) == (align_offset & align_mask)
  635. */
  636. unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
  637. {
  638. unsigned long addr;
  639. if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
  640. addr = unmapped_area_topdown(info);
  641. else
  642. addr = unmapped_area(info);
  643. trace_vm_unmapped_area(addr, info);
  644. return addr;
  645. }
  646. /* Get an address range which is currently unmapped.
  647. * For shmat() with addr=0.
  648. *
  649. * Ugly calling convention alert:
  650. * Return value with the low bits set means error value,
  651. * ie
  652. * if (ret & ~PAGE_MASK)
  653. * error = ret;
  654. *
  655. * This function "knows" that -ENOMEM has the bits set.
  656. */
  657. unsigned long
  658. generic_get_unmapped_area(struct file *filp, unsigned long addr,
  659. unsigned long len, unsigned long pgoff,
  660. unsigned long flags, vm_flags_t vm_flags)
  661. {
  662. struct mm_struct *mm = current->mm;
  663. struct vm_area_struct *vma, *prev;
  664. struct vm_unmapped_area_info info = {};
  665. const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
  666. if (len > mmap_end - mmap_min_addr)
  667. return -ENOMEM;
  668. if (flags & MAP_FIXED)
  669. return addr;
  670. if (addr) {
  671. addr = PAGE_ALIGN(addr);
  672. vma = find_vma_prev(mm, addr, &prev);
  673. if (mmap_end - len >= addr && addr >= mmap_min_addr &&
  674. (!vma || addr + len <= vm_start_gap(vma)) &&
  675. (!prev || addr >= vm_end_gap(prev)))
  676. return addr;
  677. }
  678. info.length = len;
  679. info.low_limit = mm->mmap_base;
  680. info.high_limit = mmap_end;
  681. info.start_gap = stack_guard_placement(vm_flags);
  682. return vm_unmapped_area(&info);
  683. }
  684. #ifndef HAVE_ARCH_UNMAPPED_AREA
  685. unsigned long
  686. arch_get_unmapped_area(struct file *filp, unsigned long addr,
  687. unsigned long len, unsigned long pgoff,
  688. unsigned long flags, vm_flags_t vm_flags)
  689. {
  690. return generic_get_unmapped_area(filp, addr, len, pgoff, flags,
  691. vm_flags);
  692. }
  693. #endif
  694. /*
  695. * This mmap-allocator allocates new areas top-down from below the
  696. * stack's low limit (the base):
  697. */
  698. unsigned long
  699. generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
  700. unsigned long len, unsigned long pgoff,
  701. unsigned long flags, vm_flags_t vm_flags)
  702. {
  703. struct vm_area_struct *vma, *prev;
  704. struct mm_struct *mm = current->mm;
  705. struct vm_unmapped_area_info info = {};
  706. const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
  707. /* requested length too big for entire address space */
  708. if (len > mmap_end - mmap_min_addr)
  709. return -ENOMEM;
  710. if (flags & MAP_FIXED)
  711. return addr;
  712. /* requesting a specific address */
  713. if (addr) {
  714. addr = PAGE_ALIGN(addr);
  715. vma = find_vma_prev(mm, addr, &prev);
  716. if (mmap_end - len >= addr && addr >= mmap_min_addr &&
  717. (!vma || addr + len <= vm_start_gap(vma)) &&
  718. (!prev || addr >= vm_end_gap(prev)))
  719. return addr;
  720. }
  721. info.flags = VM_UNMAPPED_AREA_TOPDOWN;
  722. info.length = len;
  723. info.low_limit = PAGE_SIZE;
  724. info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
  725. info.start_gap = stack_guard_placement(vm_flags);
  726. addr = vm_unmapped_area(&info);
  727. /*
  728. * A failed mmap() very likely causes application failure,
  729. * so fall back to the bottom-up function here. This scenario
  730. * can happen with large stack limits and large mmap()
  731. * allocations.
  732. */
  733. if (offset_in_page(addr)) {
  734. VM_BUG_ON(addr != -ENOMEM);
  735. info.flags = 0;
  736. info.low_limit = TASK_UNMAPPED_BASE;
  737. info.high_limit = mmap_end;
  738. addr = vm_unmapped_area(&info);
  739. }
  740. return addr;
  741. }
  742. #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
  743. unsigned long
  744. arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
  745. unsigned long len, unsigned long pgoff,
  746. unsigned long flags, vm_flags_t vm_flags)
  747. {
  748. return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags,
  749. vm_flags);
  750. }
  751. #endif
  752. unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp,
  753. unsigned long addr, unsigned long len,
  754. unsigned long pgoff, unsigned long flags,
  755. vm_flags_t vm_flags)
  756. {
  757. if (test_bit(MMF_TOPDOWN, &mm->flags))
  758. return arch_get_unmapped_area_topdown(filp, addr, len, pgoff,
  759. flags, vm_flags);
  760. return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags);
  761. }
  762. unsigned long
  763. __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
  764. unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
  765. {
  766. unsigned long (*get_area)(struct file *, unsigned long,
  767. unsigned long, unsigned long, unsigned long)
  768. = NULL;
  769. unsigned long error = arch_mmap_check(addr, len, flags);
  770. if (error)
  771. return error;
  772. /* Careful about overflows.. */
  773. if (len > TASK_SIZE)
  774. return -ENOMEM;
  775. if (file) {
  776. if (file->f_op->get_unmapped_area)
  777. get_area = file->f_op->get_unmapped_area;
  778. } else if (flags & MAP_SHARED) {
  779. /*
  780. * mmap_region() will call shmem_zero_setup() to create a file,
  781. * so use shmem's get_unmapped_area in case it can be huge.
  782. */
  783. get_area = shmem_get_unmapped_area;
  784. }
  785. /* Always treat pgoff as zero for anonymous memory. */
  786. if (!file)
  787. pgoff = 0;
  788. if (get_area) {
  789. addr = get_area(file, addr, len, pgoff, flags);
  790. } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)
  791. && !addr /* no hint */
  792. && IS_ALIGNED(len, PMD_SIZE)) {
  793. /* Ensures that larger anonymous mappings are THP aligned. */
  794. addr = thp_get_unmapped_area_vmflags(file, addr, len,
  795. pgoff, flags, vm_flags);
  796. } else {
  797. addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len,
  798. pgoff, flags, vm_flags);
  799. }
  800. if (IS_ERR_VALUE(addr))
  801. return addr;
  802. if (addr > TASK_SIZE - len)
  803. return -ENOMEM;
  804. if (offset_in_page(addr))
  805. return -EINVAL;
  806. error = security_mmap_addr(addr);
  807. return error ? error : addr;
  808. }
  809. unsigned long
  810. mm_get_unmapped_area(struct mm_struct *mm, struct file *file,
  811. unsigned long addr, unsigned long len,
  812. unsigned long pgoff, unsigned long flags)
  813. {
  814. if (test_bit(MMF_TOPDOWN, &mm->flags))
  815. return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags, 0);
  816. return arch_get_unmapped_area(file, addr, len, pgoff, flags, 0);
  817. }
  818. EXPORT_SYMBOL(mm_get_unmapped_area);
  819. /**
  820. * find_vma_intersection() - Look up the first VMA which intersects the interval
  821. * @mm: The process address space.
  822. * @start_addr: The inclusive start user address.
  823. * @end_addr: The exclusive end user address.
  824. *
  825. * Returns: The first VMA within the provided range, %NULL otherwise. Assumes
  826. * start_addr < end_addr.
  827. */
  828. struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
  829. unsigned long start_addr,
  830. unsigned long end_addr)
  831. {
  832. unsigned long index = start_addr;
  833. mmap_assert_locked(mm);
  834. return mt_find(&mm->mm_mt, &index, end_addr - 1);
  835. }
  836. EXPORT_SYMBOL(find_vma_intersection);
  837. /**
  838. * find_vma() - Find the VMA for a given address, or the next VMA.
  839. * @mm: The mm_struct to check
  840. * @addr: The address
  841. *
  842. * Returns: The VMA associated with addr, or the next VMA.
  843. * May return %NULL in the case of no VMA at addr or above.
  844. */
  845. struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
  846. {
  847. unsigned long index = addr;
  848. mmap_assert_locked(mm);
  849. return mt_find(&mm->mm_mt, &index, ULONG_MAX);
  850. }
  851. EXPORT_SYMBOL(find_vma);
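/*
 * Usage note: find_vma() returns the VMA containing @addr or, if @addr falls
 * in a hole, the next VMA above it. For example, if the only VMA spans
 * [0x1000, 0x2000), find_vma(mm, 0) still returns that VMA even though
 * address 0 is unmapped, so callers that need containment must also check
 * vma->vm_start <= addr (as find_extend_vma_locked() does below).
 */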
  852. /**
  853. * find_vma_prev() - Find the VMA for a given address, or the next vma and
  854. * set %pprev to the previous VMA, if any.
  855. * @mm: The mm_struct to check
  856. * @addr: The address
  857. * @pprev: The pointer to set to the previous VMA
  858. *
  859. * Note that the RCU lock is not taken here since the external mmap_lock is
  860. * used instead.
  861. *
  862. * Returns: The VMA associated with @addr, or the next vma.
  863. * May return %NULL in the case of no vma at addr or above.
  864. */
  865. struct vm_area_struct *
  866. find_vma_prev(struct mm_struct *mm, unsigned long addr,
  867. struct vm_area_struct **pprev)
  868. {
  869. struct vm_area_struct *vma;
  870. VMA_ITERATOR(vmi, mm, addr);
  871. vma = vma_iter_load(&vmi);
  872. *pprev = vma_prev(&vmi);
  873. if (!vma)
  874. vma = vma_next(&vmi);
  875. return vma;
  876. }
  877. /*
  878. * Verify that the stack growth is acceptable and
  879. * update accounting. This is shared with both the
  880. * grow-up and grow-down cases.
  881. */
  882. static int acct_stack_growth(struct vm_area_struct *vma,
  883. unsigned long size, unsigned long grow)
  884. {
  885. struct mm_struct *mm = vma->vm_mm;
  886. unsigned long new_start;
  887. /* address space limit tests */
  888. if (!may_expand_vm(mm, vma->vm_flags, grow))
  889. return -ENOMEM;
  890. /* Stack limit test */
  891. if (size > rlimit(RLIMIT_STACK))
  892. return -ENOMEM;
  893. /* mlock limit tests */
  894. if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT))
  895. return -ENOMEM;
  896. /* Check to ensure the stack will not grow into a hugetlb-only region */
  897. new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
  898. vma->vm_end - size;
  899. if (is_hugepage_only_range(vma->vm_mm, new_start, size))
  900. return -EFAULT;
  901. /*
  902. * Overcommit.. This must be the final test, as it will
  903. * update security statistics.
  904. */
  905. if (security_vm_enough_memory_mm(mm, grow))
  906. return -ENOMEM;
  907. return 0;
  908. }
  909. #if defined(CONFIG_STACK_GROWSUP)
  910. /*
  911. * PA-RISC uses this for its stack.
  912. * vma is the last one with address > vma->vm_end. Have to extend vma.
  913. */
  914. static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
  915. {
  916. struct mm_struct *mm = vma->vm_mm;
  917. struct vm_area_struct *next;
  918. unsigned long gap_addr;
  919. int error = 0;
  920. VMA_ITERATOR(vmi, mm, vma->vm_start);
  921. if (!(vma->vm_flags & VM_GROWSUP))
  922. return -EFAULT;
  923. /* Guard against exceeding limits of the address space. */
  924. address &= PAGE_MASK;
  925. if (address >= (TASK_SIZE & PAGE_MASK))
  926. return -ENOMEM;
  927. address += PAGE_SIZE;
  928. /* Enforce stack_guard_gap */
  929. gap_addr = address + stack_guard_gap;
  930. /* Guard against overflow */
  931. if (gap_addr < address || gap_addr > TASK_SIZE)
  932. gap_addr = TASK_SIZE;
  933. next = find_vma_intersection(mm, vma->vm_end, gap_addr);
  934. if (next && vma_is_accessible(next)) {
  935. if (!(next->vm_flags & VM_GROWSUP))
  936. return -ENOMEM;
  937. /* Check that both stack segments have the same anon_vma? */
  938. }
  939. if (next)
  940. vma_iter_prev_range_limit(&vmi, address);
  941. vma_iter_config(&vmi, vma->vm_start, address);
  942. if (vma_iter_prealloc(&vmi, vma))
  943. return -ENOMEM;
  944. /* We must make sure the anon_vma is allocated. */
  945. if (unlikely(anon_vma_prepare(vma))) {
  946. vma_iter_free(&vmi);
  947. return -ENOMEM;
  948. }
  949. /* Lock the VMA before expanding to prevent concurrent page faults */
  950. vma_start_write(vma);
  951. /*
  952. * vma->vm_start/vm_end cannot change under us because the caller
  953. * is required to hold the mmap_lock in read mode. We need the
  954. * anon_vma lock to serialize against concurrent expand_stacks.
  955. */
  956. anon_vma_lock_write(vma->anon_vma);
  957. /* Somebody else might have raced and expanded it already */
  958. if (address > vma->vm_end) {
  959. unsigned long size, grow;
  960. size = address - vma->vm_start;
  961. grow = (address - vma->vm_end) >> PAGE_SHIFT;
  962. error = -ENOMEM;
  963. if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
  964. error = acct_stack_growth(vma, size, grow);
  965. if (!error) {
  966. /*
  967. * We only hold a shared mmap_lock lock here, so
  968. * we need to protect against concurrent vma
  969. * expansions. anon_vma_lock_write() doesn't
  970. * help here, as we don't guarantee that all
  971. * growable vmas in a mm share the same root
  972. * anon vma. So, we reuse mm->page_table_lock
  973. * to guard against concurrent vma expansions.
  974. */
  975. spin_lock(&mm->page_table_lock);
  976. if (vma->vm_flags & VM_LOCKED)
  977. mm->locked_vm += grow;
  978. vm_stat_account(mm, vma->vm_flags, grow);
  979. anon_vma_interval_tree_pre_update_vma(vma);
  980. vma->vm_end = address;
  981. /* Overwrite old entry in mtree. */
  982. vma_iter_store(&vmi, vma);
  983. anon_vma_interval_tree_post_update_vma(vma);
  984. spin_unlock(&mm->page_table_lock);
  985. perf_event_mmap(vma);
  986. }
  987. }
  988. }
  989. anon_vma_unlock_write(vma->anon_vma);
  990. vma_iter_free(&vmi);
  991. validate_mm(mm);
  992. return error;
  993. }
  994. #endif /* CONFIG_STACK_GROWSUP */
  995. /*
  996. * vma is the first one with address < vma->vm_start. Have to extend vma.
  997. * mmap_lock held for writing.
  998. */
  999. int expand_downwards(struct vm_area_struct *vma, unsigned long address)
  1000. {
  1001. struct mm_struct *mm = vma->vm_mm;
  1002. struct vm_area_struct *prev;
  1003. int error = 0;
  1004. VMA_ITERATOR(vmi, mm, vma->vm_start);
  1005. if (!(vma->vm_flags & VM_GROWSDOWN))
  1006. return -EFAULT;
  1007. address &= PAGE_MASK;
  1008. if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
  1009. return -EPERM;
  1010. /* Enforce stack_guard_gap */
  1011. prev = vma_prev(&vmi);
  1012. /* Check that both stack segments have the same anon_vma? */
  1013. if (prev) {
  1014. if (!(prev->vm_flags & VM_GROWSDOWN) &&
  1015. vma_is_accessible(prev) &&
  1016. (address - prev->vm_end < stack_guard_gap))
  1017. return -ENOMEM;
  1018. }
  1019. if (prev)
  1020. vma_iter_next_range_limit(&vmi, vma->vm_start);
  1021. vma_iter_config(&vmi, address, vma->vm_end);
  1022. if (vma_iter_prealloc(&vmi, vma))
  1023. return -ENOMEM;
  1024. /* We must make sure the anon_vma is allocated. */
  1025. if (unlikely(anon_vma_prepare(vma))) {
  1026. vma_iter_free(&vmi);
  1027. return -ENOMEM;
  1028. }
  1029. /* Lock the VMA before expanding to prevent concurrent page faults */
  1030. vma_start_write(vma);
  1031. /*
  1032. * vma->vm_start/vm_end cannot change under us because the caller
  1033. * is required to hold the mmap_lock in read mode. We need the
  1034. * anon_vma lock to serialize against concurrent expand_stacks.
  1035. */
  1036. anon_vma_lock_write(vma->anon_vma);
  1037. /* Somebody else might have raced and expanded it already */
  1038. if (address < vma->vm_start) {
  1039. unsigned long size, grow;
  1040. size = vma->vm_end - address;
  1041. grow = (vma->vm_start - address) >> PAGE_SHIFT;
  1042. error = -ENOMEM;
  1043. if (grow <= vma->vm_pgoff) {
  1044. error = acct_stack_growth(vma, size, grow);
  1045. if (!error) {
  1046. /*
  1047. * We only hold a shared mmap_lock lock here, so
  1048. * we need to protect against concurrent vma
  1049. * expansions. anon_vma_lock_write() doesn't
  1050. * help here, as we don't guarantee that all
  1051. * growable vmas in a mm share the same root
  1052. * anon vma. So, we reuse mm->page_table_lock
  1053. * to guard against concurrent vma expansions.
  1054. */
  1055. spin_lock(&mm->page_table_lock);
  1056. if (vma->vm_flags & VM_LOCKED)
  1057. mm->locked_vm += grow;
  1058. vm_stat_account(mm, vma->vm_flags, grow);
  1059. anon_vma_interval_tree_pre_update_vma(vma);
  1060. vma->vm_start = address;
  1061. vma->vm_pgoff -= grow;
  1062. /* Overwrite old entry in mtree. */
  1063. vma_iter_store(&vmi, vma);
  1064. anon_vma_interval_tree_post_update_vma(vma);
  1065. spin_unlock(&mm->page_table_lock);
  1066. perf_event_mmap(vma);
  1067. }
  1068. }
  1069. }
  1070. anon_vma_unlock_write(vma->anon_vma);
  1071. vma_iter_free(&vmi);
  1072. validate_mm(mm);
  1073. return error;
  1074. }
  1075. /* enforced gap between the expanding stack and other mappings. */
  1076. unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
  1077. static int __init cmdline_parse_stack_guard_gap(char *p)
  1078. {
  1079. unsigned long val;
  1080. char *endptr;
  1081. val = simple_strtoul(p, &endptr, 10);
  1082. if (!*endptr)
  1083. stack_guard_gap = val << PAGE_SHIFT;
  1084. return 1;
  1085. }
  1086. __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
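/*
 * Usage note: the value parsed above is in pages, not bytes, because it is
 * shifted by PAGE_SHIFT before being stored. Booting with the command line
 * parameter "stack_guard_gap=1" therefore shrinks the gap to a single page,
 * while the default of 256 pages corresponds to 1 MiB with 4 KiB pages.
 */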
  1087. #ifdef CONFIG_STACK_GROWSUP
  1088. int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
  1089. {
  1090. return expand_upwards(vma, address);
  1091. }
  1092. struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
  1093. {
  1094. struct vm_area_struct *vma, *prev;
  1095. addr &= PAGE_MASK;
  1096. vma = find_vma_prev(mm, addr, &prev);
  1097. if (vma && (vma->vm_start <= addr))
  1098. return vma;
  1099. if (!prev)
  1100. return NULL;
  1101. if (expand_stack_locked(prev, addr))
  1102. return NULL;
  1103. if (prev->vm_flags & VM_LOCKED)
  1104. populate_vma_page_range(prev, addr, prev->vm_end, NULL);
  1105. return prev;
  1106. }
  1107. #else
  1108. int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
  1109. {
  1110. return expand_downwards(vma, address);
  1111. }
  1112. struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
  1113. {
  1114. struct vm_area_struct *vma;
  1115. unsigned long start;
  1116. addr &= PAGE_MASK;
  1117. vma = find_vma(mm, addr);
  1118. if (!vma)
  1119. return NULL;
  1120. if (vma->vm_start <= addr)
  1121. return vma;
  1122. start = vma->vm_start;
  1123. if (expand_stack_locked(vma, addr))
  1124. return NULL;
  1125. if (vma->vm_flags & VM_LOCKED)
  1126. populate_vma_page_range(vma, addr, start, NULL);
  1127. return vma;
  1128. }
  1129. #endif
  1130. #if defined(CONFIG_STACK_GROWSUP)
  1131. #define vma_expand_up(vma,addr) expand_upwards(vma, addr)
  1132. #define vma_expand_down(vma, addr) (-EFAULT)
  1133. #else
  1134. #define vma_expand_up(vma,addr) (-EFAULT)
  1135. #define vma_expand_down(vma, addr) expand_downwards(vma, addr)
  1136. #endif
  1137. /*
  1138. * expand_stack(): legacy interface for page faulting. Don't use unless
  1139. * you have to.
  1140. *
  1141. * This is called with the mm locked for reading, drops the lock, takes
  1142. * the lock for writing, tries to look up a vma again, expands it if
  1143. * necessary, and downgrades the lock to reading again.
  1144. *
  1145. * If no vma is found or it can't be expanded, it returns NULL and has
  1146. * dropped the lock.
  1147. */
  1148. struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
  1149. {
  1150. struct vm_area_struct *vma, *prev;
  1151. mmap_read_unlock(mm);
  1152. if (mmap_write_lock_killable(mm))
  1153. return NULL;
  1154. vma = find_vma_prev(mm, addr, &prev);
  1155. if (vma && vma->vm_start <= addr)
  1156. goto success;
  1157. if (prev && !vma_expand_up(prev, addr)) {
  1158. vma = prev;
  1159. goto success;
  1160. }
  1161. if (vma && !vma_expand_down(vma, addr))
  1162. goto success;
  1163. mmap_write_unlock(mm);
  1164. return NULL;
  1165. success:
  1166. mmap_write_downgrade(mm);
  1167. return vma;
  1168. }
  1169. /* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls.
  1170. * @mm: The mm_struct
  1171. * @start: The start address to munmap
  1172. * @len: The length to be munmapped.
  1173. * @uf: The userfaultfd list_head
  1174. *
  1175. * Return: 0 on success, error otherwise.
  1176. */
  1177. int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
  1178. struct list_head *uf)
  1179. {
  1180. VMA_ITERATOR(vmi, mm, start);
  1181. return do_vmi_munmap(&vmi, mm, start, len, uf, false);
  1182. }
  1183. static unsigned long __mmap_region(struct file *file, unsigned long addr,
  1184. unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
  1185. struct list_head *uf)
  1186. {
  1187. struct mm_struct *mm = current->mm;
  1188. struct vm_area_struct *vma = NULL;
  1189. pgoff_t pglen = PHYS_PFN(len);
  1190. unsigned long charged = 0;
  1191. struct vma_munmap_struct vms;
  1192. struct ma_state mas_detach;
  1193. struct maple_tree mt_detach;
  1194. unsigned long end = addr + len;
  1195. int error;
  1196. VMA_ITERATOR(vmi, mm, addr);
  1197. VMG_STATE(vmg, mm, &vmi, addr, end, vm_flags, pgoff);
  1198. vmg.file = file;
  1199. /* Find the first overlapping VMA */
  1200. vma = vma_find(&vmi, end);
  1201. init_vma_munmap(&vms, &vmi, vma, addr, end, uf, /* unlock = */ false);
  1202. if (vma) {
  1203. mt_init_flags(&mt_detach, vmi.mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
  1204. mt_on_stack(mt_detach);
  1205. mas_init(&mas_detach, &mt_detach, /* addr = */ 0);
  1206. /* Prepare to unmap any existing mapping in the area */
  1207. error = vms_gather_munmap_vmas(&vms, &mas_detach);
  1208. if (error)
  1209. goto gather_failed;
  1210. vmg.next = vms.next;
  1211. vmg.prev = vms.prev;
  1212. vma = NULL;
  1213. } else {
  1214. vmg.next = vma_iter_next_rewind(&vmi, &vmg.prev);
  1215. }
  1216. /* Check against address space limit. */
  1217. if (!may_expand_vm(mm, vm_flags, pglen - vms.nr_pages)) {
  1218. error = -ENOMEM;
  1219. goto abort_munmap;
  1220. }
  1221. /*
  1222. * Private writable mapping: check memory availability
  1223. */
  1224. if (accountable_mapping(file, vm_flags)) {
  1225. charged = pglen;
  1226. charged -= vms.nr_accounted;
  1227. if (charged) {
  1228. error = security_vm_enough_memory_mm(mm, charged);
  1229. if (error)
  1230. goto abort_munmap;
  1231. }
  1232. vms.nr_accounted = 0;
  1233. vm_flags |= VM_ACCOUNT;
  1234. vmg.flags = vm_flags;
  1235. }
  1236. /*
  1237. * clear PTEs while the vma is still in the tree so that rmap
  1238. * cannot race with the freeing later in the truncate scenario.
  1239. * This is also needed for mmap_file(), which is why vm_ops
  1240. * close function is called.
  1241. */
  1242. vms_clean_up_area(&vms, &mas_detach);
  1243. vma = vma_merge_new_range(&vmg);
  1244. if (vma)
  1245. goto expanded;
  1246. /*
  1247. * Determine the object being mapped and call the appropriate
  1248. * specific mapper. The address has already been validated but
  1249. * not unmapped; however, the maps have been removed from the list.
  1250. */
  1251. vma = vm_area_alloc(mm);
  1252. if (!vma) {
  1253. error = -ENOMEM;
  1254. goto unacct_error;
  1255. }
  1256. vma_iter_config(&vmi, addr, end);
  1257. vma_set_range(vma, addr, end, pgoff);
  1258. vm_flags_init(vma, vm_flags);
  1259. vma->vm_page_prot = vm_get_page_prot(vm_flags);
  1260. if (vma_iter_prealloc(&vmi, vma)) {
  1261. error = -ENOMEM;
  1262. goto free_vma;
  1263. }
  1264. if (file) {
  1265. vma->vm_file = get_file(file);
  1266. error = mmap_file(file, vma);
  1267. if (error)
  1268. goto unmap_and_free_file_vma;
  1269. /* Drivers cannot alter the address of the VMA. */
  1270. WARN_ON_ONCE(addr != vma->vm_start);
  1271. /*
  1272. * Drivers should not permit writability when previously it was
  1273. * disallowed.
  1274. */
  1275. VM_WARN_ON_ONCE(vm_flags != vma->vm_flags &&
  1276. !(vm_flags & VM_MAYWRITE) &&
  1277. (vma->vm_flags & VM_MAYWRITE));
  1278. vma_iter_config(&vmi, addr, end);
  1279. /*
  1280. * If vm_flags changed after mmap_file(), we should try merge
  1281. * vma again as we may succeed this time.
  1282. */
  1283. if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
  1284. struct vm_area_struct *merge;
  1285. vmg.flags = vma->vm_flags;
  1286. /* If this fails, state is reset ready for a reattempt. */
  1287. merge = vma_merge_new_range(&vmg);
  1288. if (merge) {
  1289. /*
  1290. * ->mmap() can change vma->vm_file and fput
  1291. * the original file. So fput the vma->vm_file
  1292. * here or we would add an extra fput for file
  1293. * and cause general protection fault
  1294. * ultimately.
  1295. */
  1296. fput(vma->vm_file);
  1297. vm_area_free(vma);
  1298. vma = merge;
  1299. /* Update vm_flags to pick up the change. */
  1300. vm_flags = vma->vm_flags;
  1301. goto file_expanded;
  1302. }
  1303. /*
  1304. * In the unlikely event that more memory was needed, but
  1305. * not available for the vma merge, the vma iterator
  1306. * will have no memory reserved for the write we told
  1307. * the driver was happening. To keep up the ruse,
  1308. * ensure the allocation for the store succeeds.
  1309. */
                        if (vmg_nomem(&vmg)) {
                                mas_preallocate(&vmi.mas, vma,
                                                GFP_KERNEL|__GFP_NOFAIL);
                        }
                }

                vm_flags = vma->vm_flags;
        } else if (vm_flags & VM_SHARED) {
                error = shmem_zero_setup(vma);
                if (error)
                        goto free_iter_vma;
        } else {
                vma_set_anonymous(vma);
        }

#ifdef CONFIG_SPARC64
        /* TODO: Fix SPARC ADI! */
        WARN_ON_ONCE(!arch_validate_flags(vm_flags));
#endif

        /* Lock the VMA since it is modified after insertion into VMA tree */
        vma_start_write(vma);
        vma_iter_store(&vmi, vma);
        mm->map_count++;
        vma_link_file(vma);

        /*
         * vma_merge_new_range() calls khugepaged_enter_vma() too, the below
         * call covers the non-merge case.
         */
        khugepaged_enter_vma(vma, vma->vm_flags);

file_expanded:
        file = vma->vm_file;
        ksm_add_vma(vma);

expanded:
        perf_event_mmap(vma);

        /* Unmap any existing mapping in the area */
        vms_complete_munmap_vmas(&vms, &mas_detach);

        vm_stat_account(mm, vm_flags, pglen);
        if (vm_flags & VM_LOCKED) {
                if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
                                        is_vm_hugetlb_page(vma) ||
                                        vma == get_gate_vma(current->mm))
                        vm_flags_clear(vma, VM_LOCKED_MASK);
                else
                        mm->locked_vm += pglen;
        }

        if (file)
                uprobe_mmap(vma);
        /*
         * A new (or expanded) vma always gets soft-dirty status.
         * Otherwise the user-space soft-dirty page tracker would not be
         * able to tell that a vma was unmapped and then mapped again in
         * place, which must be treated as a completely new data area.
         */
        vm_flags_set(vma, VM_SOFTDIRTY);

        vma_set_page_prot(vma);

        return addr;

unmap_and_free_file_vma:
        fput(vma->vm_file);
        vma->vm_file = NULL;

        vma_iter_set(&vmi, vma->vm_end);
        /* Undo any partial mapping done by a device driver. */
        unmap_region(&vmi.mas, vma, vmg.prev, vmg.next);

free_iter_vma:
        vma_iter_free(&vmi);
free_vma:
        vm_area_free(vma);
unacct_error:
        if (charged)
                vm_unacct_memory(charged);

abort_munmap:
        vms_abort_munmap_vmas(&vms, &mas_detach);
gather_failed:
        return error;
}
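
/*
 * mmap_region() is the externally visible wrapper around __mmap_region(): it
 * rejects mappings denied by MDWE (write+exec), lets the architecture veto
 * the vm_flags and, for shared writable file mappings, marks the file's
 * address space writable around the call, dropping that marker again on
 * every outcome.
 */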
unsigned long mmap_region(struct file *file, unsigned long addr,
                          unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
                          struct list_head *uf)
{
        unsigned long ret;
        bool writable_file_mapping = false;

        /* Check to see if MDWE is applicable. */
        if (map_deny_write_exec(vm_flags, vm_flags))
                return -EACCES;

        /* Allow architectures to sanity-check the vm_flags. */
        if (!arch_validate_flags(vm_flags))
                return -EINVAL;

        /* Map writable and ensure this isn't a sealed memfd. */
        if (file && is_shared_maywrite(vm_flags)) {
                int error = mapping_map_writable(file->f_mapping);

                if (error)
                        return error;
                writable_file_mapping = true;
        }

        ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf);

        /* Clear our write mapping regardless of error. */
        if (writable_file_mapping)
                mapping_unmap_writable(file->f_mapping);

        validate_mm(current->mm);
        return ret;
}
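
/*
 * Common helper for vm_munmap() and the munmap(2) syscall: take the mmap
 * write lock and unmap [start, start + len). When @unlock is true,
 * do_vmi_munmap() is allowed to release the lock itself on success, hence
 * the conditional unlock below.
 */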
static int __vm_munmap(unsigned long start, size_t len, bool unlock)
{
        int ret;
        struct mm_struct *mm = current->mm;
        LIST_HEAD(uf);
        VMA_ITERATOR(vmi, mm, start);

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock);
        if (ret || !unlock)
                mmap_write_unlock(mm);

        userfaultfd_unmap_complete(mm, &uf);
        return ret;
}
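
/* In-kernel interface: unmap a range without releasing the mmap lock early. */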
int vm_munmap(unsigned long start, size_t len)
{
        return __vm_munmap(start, len, false);
}
EXPORT_SYMBOL(vm_munmap);
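
/* munmap(2): strip any architecture tag bits from @addr, then unmap. */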
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
        addr = untagged_addr(addr);
        return __vm_munmap(addr, len, true);
}

/*
 * Emulation of deprecated remap_file_pages() syscall.
 */
SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
                unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long populate = 0;
        unsigned long ret = -EINVAL;
        struct file *file;
        vm_flags_t vm_flags;

        pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n",
                     current->comm, current->pid);

        if (prot)
                return ret;
        start = start & PAGE_MASK;
        size = size & PAGE_MASK;

        if (start + size <= start)
                return ret;

        /* Does pgoff wrap? */
        if (pgoff + (size >> PAGE_SHIFT) < pgoff)
                return ret;

        if (mmap_read_lock_killable(mm))
                return -EINTR;
        /*
         * Look up the VMA under the read lock first so we can perform the
         * security check without holding locks (which can be problematic).
         * We reacquire a write lock later and check that nothing changed
         * underneath us.
         */
        vma = vma_lookup(mm, start);

        if (!vma || !(vma->vm_flags & VM_SHARED)) {
                mmap_read_unlock(mm);
                return -EINVAL;
        }

        prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
        prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
        prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;

        flags &= MAP_NONBLOCK;
        flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
        if (vma->vm_flags & VM_LOCKED)
                flags |= MAP_LOCKED;

        /* Save vm_flags used to calculate prot and flags, and recheck later. */
        vm_flags = vma->vm_flags;
        file = get_file(vma->vm_file);

        mmap_read_unlock(mm);

        /* Call outside mmap_lock to be consistent with other callers. */
        ret = security_mmap_file(file, prot, flags);
        if (ret) {
                fput(file);
                return ret;
        }

        ret = -EINVAL;

        /* OK security check passed, take write lock + let it rip. */
        if (mmap_write_lock_killable(mm)) {
                fput(file);
                return -EINTR;
        }

        vma = vma_lookup(mm, start);

        if (!vma)
                goto out;

        /* Make sure things didn't change under us. */
        if (vma->vm_flags != vm_flags)
                goto out;
        if (vma->vm_file != file)
                goto out;

        if (start + size > vma->vm_end) {
                VMA_ITERATOR(vmi, mm, vma->vm_end);
                struct vm_area_struct *next, *prev = vma;

                for_each_vma_range(vmi, next, start + size) {
                        /* hole between vmas ? */
                        if (next->vm_start != prev->vm_end)
                                goto out;

                        if (next->vm_file != vma->vm_file)
                                goto out;

                        if (next->vm_flags != vma->vm_flags)
                                goto out;

                        if (start + size <= next->vm_end)
                                break;

                        prev = next;
                }

                if (!next)
                        goto out;
        }

        ret = do_mmap(vma->vm_file, start, size,
                      prot, flags, 0, pgoff, &populate, NULL);
out:
        mmap_write_unlock(mm);
        fput(file);
        if (populate)
                mm_populate(ret, populate);
        if (!IS_ERR_VALUE(ret))
                ret = 0;
        return ret;
}

/*
 * do_brk_flags() - Increase the brk vma if the flags match.
 * @vmi: The vma iterator
 * @addr: The start address
 * @len: The length of the increase
 * @vma: The vma to expand, or NULL
 * @flags: The VMA Flags
 *
 * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
 * do not match then create a new anonymous VMA. Eventually we may be able to
 * do some brk-specific accounting here.
 */
static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
                unsigned long addr, unsigned long len, unsigned long flags)
{
        struct mm_struct *mm = current->mm;

        /*
         * Check against address space limits by the changed size
         * Note: This happens *after* clearing old mappings in some code paths.
         */
        flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
        if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
                return -ENOMEM;

        if (mm->map_count > sysctl_max_map_count)
                return -ENOMEM;

        if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
                return -ENOMEM;

        /*
         * Expand the existing vma if possible; Note that singular lists do not
         * occur after forking, so the expand will only happen on new VMAs.
         */
        if (vma && vma->vm_end == addr) {
                VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr));

                vmg.prev = vma;
                /* vmi is positioned at prev, which this mode expects. */
                vmg.merge_flags = VMG_FLAG_JUST_EXPAND;
                if (vma_merge_new_range(&vmg))
                        goto out;
                else if (vmg_nomem(&vmg))
                        goto unacct_fail;
        }

        if (vma)
                vma_iter_next_range(vmi);
        /* create a vma struct for an anonymous mapping */
        vma = vm_area_alloc(mm);
        if (!vma)
                goto unacct_fail;

        vma_set_anonymous(vma);
        vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT);
        vm_flags_init(vma, flags);
        vma->vm_page_prot = vm_get_page_prot(flags);
        vma_start_write(vma);
        if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL))
                goto mas_store_fail;

        mm->map_count++;
        validate_mm(mm);
        ksm_add_vma(vma);
out:
        perf_event_mmap(vma);
        mm->total_vm += len >> PAGE_SHIFT;
        mm->data_vm += len >> PAGE_SHIFT;
        if (flags & VM_LOCKED)
                mm->locked_vm += (len >> PAGE_SHIFT);
        vm_flags_set(vma, VM_SOFTDIRTY);
        return 0;

mas_store_fail:
        vm_area_free(vma);
unacct_fail:
        vm_unacct_memory(len >> PAGE_SHIFT);
        return -ENOMEM;
}
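
/*
 * vm_brk_flags() is the exported entry point (used, for example, by binfmt
 * loaders when setting up the initial brk area): it page-aligns the request,
 * clears any existing mapping in the range, then grows or creates the brk
 * VMA via do_brk_flags(), populating the range immediately if mm->def_flags
 * has VM_LOCKED set.
 */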
int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        unsigned long len;
        int ret;
        bool populate;
        LIST_HEAD(uf);
        VMA_ITERATOR(vmi, mm, addr);

        len = PAGE_ALIGN(request);
        if (len < request)
                return -ENOMEM;
        if (!len)
                return 0;

        /* Until we need other flags, refuse anything except VM_EXEC. */
        if ((flags & (~VM_EXEC)) != 0)
                return -EINVAL;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        ret = check_brk_limits(addr, len);
        if (ret)
                goto limits_failed;

        ret = do_vmi_munmap(&vmi, mm, addr, len, &uf, 0);
        if (ret)
                goto munmap_failed;

        vma = vma_prev(&vmi);
        ret = do_brk_flags(&vmi, vma, addr, len, flags);
        populate = ((mm->def_flags & VM_LOCKED) != 0);
        mmap_write_unlock(mm);
        userfaultfd_unmap_complete(mm, &uf);
        if (populate && !ret)
                mm_populate(addr, len);
        return ret;

munmap_failed:
limits_failed:
        mmap_write_unlock(mm);
        return ret;
}
EXPORT_SYMBOL(vm_brk_flags);

/* Release all mmaps. */
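/*
 * exit_mmap() runs when the last user of the mm has gone: the VMAs are first
 * unmapped under the mmap read lock (the OOM reaper may still be working on
 * this mm at that point), MMF_OOM_SKIP is then set so the OOM killer ignores
 * the task, and finally the page tables, the VMA structures and the maple
 * tree are freed under the now unreachable mmap write lock.
 */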
void exit_mmap(struct mm_struct *mm)
{
        struct mmu_gather tlb;
        struct vm_area_struct *vma;
        unsigned long nr_accounted = 0;
        VMA_ITERATOR(vmi, mm, 0);
        int count = 0;

        /* mm's last user has gone, and it's about to be pulled down */
        mmu_notifier_release(mm);

        mmap_read_lock(mm);
        arch_exit_mmap(mm);

        vma = vma_next(&vmi);
        if (!vma || unlikely(xa_is_zero(vma))) {
                /* Can happen if dup_mmap() received an OOM */
                mmap_read_unlock(mm);
                mmap_write_lock(mm);
                goto destroy;
        }

        lru_add_drain();
        flush_cache_mm(mm);
        tlb_gather_mmu_fullmm(&tlb, mm);
        /* update_hiwater_rss(mm) here? but nobody should be looking */
        /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
        unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
        mmap_read_unlock(mm);

        /*
         * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper
         * because the memory has been already freed.
         */
        set_bit(MMF_OOM_SKIP, &mm->flags);
        mmap_write_lock(mm);
        mt_clear_in_rcu(&mm->mm_mt);
        vma_iter_set(&vmi, vma->vm_end);
        free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
                      USER_PGTABLES_CEILING, true);
        tlb_finish_mmu(&tlb);

        /*
         * Walk the list again, actually closing and freeing it, with preemption
         * enabled, without holding any MM locks besides the unreachable
         * mmap_write_lock.
         */
        vma_iter_set(&vmi, vma->vm_end);
        do {
                if (vma->vm_flags & VM_ACCOUNT)
                        nr_accounted += vma_pages(vma);
                remove_vma(vma, /* unreachable = */ true);
                count++;
                cond_resched();
                vma = vma_next(&vmi);
        } while (vma && likely(!xa_is_zero(vma)));

        BUG_ON(count != mm->map_count);

        trace_exit_mmap(mm);
destroy:
        __mt_destroy(&mm->mm_mt);
        mmap_write_unlock(mm);
        vm_unacct_memory(nr_accounted);
}

/* Insert vm structure into process list sorted by address
 * and into the inode's i_mmap tree.  If vm_file is non-NULL
 * then i_mmap_rwsem is taken here.
 */
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
        unsigned long charged = vma_pages(vma);

        if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
                return -ENOMEM;
        if ((vma->vm_flags & VM_ACCOUNT) &&
            security_vm_enough_memory_mm(mm, charged))
                return -ENOMEM;

        /*
         * The vm_pgoff of a purely anonymous vma should be irrelevant
         * until its first write fault, when the page's anon_vma and index
         * are set.  But now set the vm_pgoff it will almost certainly
         * end up with (unless mremap moves it elsewhere before that
         * first write fault), so /proc/pid/maps tells a consistent story.
         *
         * By setting it to reflect the virtual start address of the
         * vma, merges and splits can happen in a seamless way, just
         * using the existing file pgoff checks and manipulations.
         * Similarly in do_mmap and in do_brk_flags.
         */
        if (vma_is_anonymous(vma)) {
                BUG_ON(vma->anon_vma);
                vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
        }

        if (vma_link(mm, vma)) {
                if (vma->vm_flags & VM_ACCOUNT)
                        vm_unacct_memory(charged);
                return -ENOMEM;
        }

        return 0;
}

/*
 * Return true if the calling process may expand its vm space by the passed
 * number of pages
 */
bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
{
        if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
                return false;

        if (is_data_mapping(flags) &&
            mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
                /* Workaround for Valgrind */
                if (rlimit(RLIMIT_DATA) == 0 &&
                    mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT)
                        return true;

                pr_warn_once("%s (%d): VmData %lu exceeds data ulimit %lu. Update limits%s.\n",
                             current->comm, current->pid,
                             (mm->data_vm + npages) << PAGE_SHIFT,
                             rlimit(RLIMIT_DATA),
                             ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data");

                if (!ignore_rlimit_data)
                        return false;
        }

        return true;
}
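
/*
 * Account @npages (which may be negative on unmap) against the mm-wide
 * counters: total_vm always, plus exec_vm, stack_vm or data_vm depending on
 * the kind of mapping described by @flags.
 */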
void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
{
        WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages);

        if (is_exec_mapping(flags))
                mm->exec_vm += npages;
        else if (is_stack_mapping(flags))
                mm->stack_vm += npages;
        else if (is_data_mapping(flags))
                mm->data_vm += npages;
}
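
/*
 * Special mappings (the vDSO and similar kernel-provided areas) are backed by
 * a struct vm_special_mapping rather than a file: the vm_operations below
 * forbid splitting, keep the mapping at a fixed size, and resolve faults from
 * the mapping's page array or its ->fault() callback.
 */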
static vm_fault_t special_mapping_fault(struct vm_fault *vmf);

/*
 * Close hook, called for unmap() and on the old vma for mremap().
 *
 * Having a close hook prevents vma merging regardless of flags.
 */
static void special_mapping_close(struct vm_area_struct *vma)
{
        const struct vm_special_mapping *sm = vma->vm_private_data;

        if (sm->close)
                sm->close(sm, vma);
}

static const char *special_mapping_name(struct vm_area_struct *vma)
{
        return ((struct vm_special_mapping *)vma->vm_private_data)->name;
}

static int special_mapping_mremap(struct vm_area_struct *new_vma)
{
        struct vm_special_mapping *sm = new_vma->vm_private_data;

        if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
                return -EFAULT;

        if (sm->mremap)
                return sm->mremap(sm, new_vma);

        return 0;
}

static int special_mapping_split(struct vm_area_struct *vma, unsigned long addr)
{
        /*
         * Forbid splitting special mappings - kernel has expectations over
         * the number of pages in mapping. Together with VM_DONTEXPAND
         * the size of vma should stay the same over the special mapping's
         * lifetime.
         */
        return -EINVAL;
}

static const struct vm_operations_struct special_mapping_vmops = {
        .close = special_mapping_close,
        .fault = special_mapping_fault,
        .mremap = special_mapping_mremap,
        .name = special_mapping_name,
        /* The vDSO code relies on VVAR pages not being accessible remotely. */
        .access = NULL,
        .may_split = special_mapping_split,
};

static vm_fault_t special_mapping_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        pgoff_t pgoff;
        struct page **pages;
        struct vm_special_mapping *sm = vma->vm_private_data;

        if (sm->fault)
                return sm->fault(sm, vmf->vma, vmf);

        pages = sm->pages;

        for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
                pgoff--;

        if (*pages) {
                struct page *page = *pages;

                get_page(page);
                vmf->page = page;
                return 0;
        }

        return VM_FAULT_SIGBUS;
}
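
/*
 * Allocate a VMA for [addr, addr + len), mark it VM_DONTEXPAND | VM_SOFTDIRTY
 * (and never mlocked), wire it to the given vm_operations and private data,
 * and insert it into @mm, updating the VM statistics on success.
 */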
static struct vm_area_struct *__install_special_mapping(
        struct mm_struct *mm,
        unsigned long addr, unsigned long len,
        unsigned long vm_flags, void *priv,
        const struct vm_operations_struct *ops)
{
        int ret;
        struct vm_area_struct *vma;

        vma = vm_area_alloc(mm);
        if (unlikely(vma == NULL))
                return ERR_PTR(-ENOMEM);

        vma_set_range(vma, addr, addr + len, 0);
        vm_flags_init(vma, (vm_flags | mm->def_flags |
                      VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK);
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);

        vma->vm_ops = ops;
        vma->vm_private_data = priv;

        ret = insert_vm_struct(mm, vma);
        if (ret)
                goto out;

        vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);

        perf_event_mmap(vma);

        return vma;

out:
        vm_area_free(vma);
        return ERR_PTR(ret);
}

bool vma_is_special_mapping(const struct vm_area_struct *vma,
        const struct vm_special_mapping *sm)
{
        return vma->vm_private_data == sm &&
                vma->vm_ops == &special_mapping_vmops;
}

/*
 * Called with mm->mmap_lock held for writing.
 * Insert a new vma covering the given region, with the given flags.
 * Its pages are supplied by the given array of struct page *.
 * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
 * The region past the last page supplied will always produce SIGBUS.
 * The array pointer and the pages it points to are assumed to stay alive
 * for as long as this mapping might exist.
 */
struct vm_area_struct *_install_special_mapping(
        struct mm_struct *mm,
        unsigned long addr, unsigned long len,
        unsigned long vm_flags, const struct vm_special_mapping *spec)
{
        return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
                                        &special_mapping_vmops);
}
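
/*
 * Typical use (illustrative sketch only, with hypothetical names - real
 * callers live in the architectures' vDSO setup code):
 *
 *	static struct page *my_vdso_pages[2];
 *	static const struct vm_special_mapping my_vdso_mapping = {
 *		.name  = "[vdso]",
 *		.pages = my_vdso_pages,
 *	};
 *
 *	vma = _install_special_mapping(mm, addr, 2 * PAGE_SIZE,
 *				       VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC,
 *				       &my_vdso_mapping);
 *	if (IS_ERR(vma))
 *		return PTR_ERR(vma);
 */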

/*
 * initialise the percpu counter for VM
 */
void __init mmap_init(void)
{
        int ret;

        ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
        VM_BUG_ON(ret);
}

/*
 * Initialise sysctl_user_reserve_kbytes.
 *
 * This is intended to prevent a user from starting a single memory hogging
 * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
 * mode.
 *
 * The default value is min(3% of free memory, 128MB)
 * 128MB is enough to recover with sshd/login, bash, and top/kill.
 */
static int init_user_reserve(void)
{
        unsigned long free_kbytes;

        free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));

        sysctl_user_reserve_kbytes = min(free_kbytes / 32, SZ_128K);
        return 0;
}
subsys_initcall(init_user_reserve);

/*
 * Initialise sysctl_admin_reserve_kbytes.
 *
 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
 * to log in and kill a memory hogging process.
 *
 * Systems with more than 256MB will reserve 8MB, enough to recover
 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
 * only reserve 3% of free pages by default.
 */
static int init_admin_reserve(void)
{
        unsigned long free_kbytes;

        free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));

        sysctl_admin_reserve_kbytes = min(free_kbytes / 32, SZ_8K);
        return 0;
}
subsys_initcall(init_admin_reserve);

/*
 * Reinitialise user and admin reserves if memory is added or removed.
 *
 * The default user reserve max is 128MB, and the default max for the
 * admin reserve is 8MB. These are usually, but not always, enough to
 * enable recovery from a memory hogging process using login/sshd, a shell,
 * and tools like top. It may make sense to increase or even disable the
 * reserve depending on the existence of swap or variations in the recovery
 * tools. So, the admin may have changed them.
 *
 * If memory is added and the reserves have been eliminated or increased above
 * the default max, then we'll trust the admin.
 *
 * If memory is removed and there isn't enough free memory, then we
 * need to reset the reserves.
 *
 * Otherwise keep the reserve set by the admin.
 */
static int reserve_mem_notifier(struct notifier_block *nb,
                                unsigned long action, void *data)
{
        unsigned long tmp, free_kbytes;

        switch (action) {
        case MEM_ONLINE:
                /* Default max is 128MB. Leave alone if modified by operator. */
                tmp = sysctl_user_reserve_kbytes;
                if (tmp > 0 && tmp < SZ_128K)
                        init_user_reserve();

                /* Default max is 8MB. Leave alone if modified by operator. */
                tmp = sysctl_admin_reserve_kbytes;
                if (tmp > 0 && tmp < SZ_8K)
                        init_admin_reserve();

                break;
        case MEM_OFFLINE:
                free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));

                if (sysctl_user_reserve_kbytes > free_kbytes) {
                        init_user_reserve();
                        pr_info("vm.user_reserve_kbytes reset to %lu\n",
                                sysctl_user_reserve_kbytes);
                }

                if (sysctl_admin_reserve_kbytes > free_kbytes) {
                        init_admin_reserve();
                        pr_info("vm.admin_reserve_kbytes reset to %lu\n",
                                sysctl_admin_reserve_kbytes);
                }

                break;
        default:
                break;
        }

        return NOTIFY_OK;
}

static int __meminit init_reserve_notifier(void)
{
        if (hotplug_memory_notifier(reserve_mem_notifier, DEFAULT_CALLBACK_PRI))
                pr_err("Failed registering memory add/remove notifier for admin reserve\n");

        return 0;
}
subsys_initcall(init_reserve_notifier);

/*
 * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
 * this VMA and its relocated range, which will now reside at [vma->vm_start -
 * shift, vma->vm_end - shift).
 *
 * This function is almost certainly NOT what you want for anything other than
 * early executable temporary stack relocation.
 */
int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
{
        /*
         * The process proceeds as follows:
         *
         * 1) Use shift to calculate the new vma endpoints.
         * 2) Extend vma to cover both the old and new ranges. This ensures the
         *    arguments passed to subsequent functions are consistent.
         * 3) Move vma's page tables to the new range.
         * 4) Free up any cleared pgd range.
         * 5) Shrink the vma to cover only the new range.
         */
        struct mm_struct *mm = vma->vm_mm;
        unsigned long old_start = vma->vm_start;
        unsigned long old_end = vma->vm_end;
        unsigned long length = old_end - old_start;
        unsigned long new_start = old_start - shift;
        unsigned long new_end = old_end - shift;
        VMA_ITERATOR(vmi, mm, new_start);
        VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
        struct vm_area_struct *next;
        struct mmu_gather tlb;

        BUG_ON(new_start > new_end);

        /*
         * ensure there are no vmas between where we want to go
         * and where we are
         */
        if (vma != vma_next(&vmi))
                return -EFAULT;

        vma_iter_prev_range(&vmi);

        /*
         * cover the whole range: [new_start, old_end)
         */
        vmg.vma = vma;
        if (vma_expand(&vmg))
                return -ENOMEM;

        /*
         * Move the page tables downwards; on failure we rely on
         * process cleanup to remove whatever mess we made.
         */
        if (length != move_page_tables(vma, old_start,
                                       vma, new_start, length, false, true))
                return -ENOMEM;

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm);
        next = vma_next(&vmi);
        if (new_end > old_start) {
                /*
                 * when the old and new regions overlap clear from new_end.
                 */
                free_pgd_range(&tlb, new_end, old_end, new_end,
                        next ? next->vm_start : USER_PGTABLES_CEILING);
        } else {
                /*
                 * Otherwise, clean from old_start; this is done to not touch
                 * the address space in [new_end, old_start), because some
                 * architectures have constraints on va-space that make this
                 * illegal (IA64) - for the others it's just a little faster.
                 */
                free_pgd_range(&tlb, old_start, old_end, new_end,
                        next ? next->vm_start : USER_PGTABLES_CEILING);
        }
        tlb_finish_mmu(&tlb);

        vma_prev(&vmi);
        /* Shrink the vma to just the new range */
        return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
}
  2055. }