  1. /*
  2. * mm/rmap.c - physical to virtual reverse mappings
  3. *
  4. * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
  5. * Released under the General Public License (GPL).
  6. *
  7. * Simple, low overhead reverse mapping scheme.
  8. * Please try to keep this thing as modular as possible.
  9. *
  10. * Provides methods for unmapping each kind of mapped page:
  11. * the anon methods track anonymous pages, and
  12. * the file methods track pages belonging to an inode.
  13. *
  14. * Original design by Rik van Riel <riel@conectiva.com.br> 2001
  15. * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
  16. * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
  17. * Contributions by Hugh Dickins 2003, 2004
  18. */
  19. /*
  20. * Lock ordering in mm:
  21. *
  22. * inode->i_rwsem (while writing or truncating, not reading or faulting)
  23. * mm->mmap_lock
  24. * mapping->invalidate_lock (in filemap_fault)
  25. * folio_lock
  26. * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
  27. * vma_start_write
  28. * mapping->i_mmap_rwsem
  29. * anon_vma->rwsem
  30. * mm->page_table_lock or pte_lock
  31. * swap_lock (in swap_duplicate, swap_info_get)
  32. * mmlist_lock (in mmput, drain_mmlist and others)
  33. * mapping->private_lock (in block_dirty_folio)
  34. * folio_lock_memcg move_lock (in block_dirty_folio)
  35. * i_pages lock (widely used)
  36. * lruvec->lru_lock (in folio_lruvec_lock_irq)
  37. * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
  38. * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
  39. * sb_lock (within inode_lock in fs/fs-writeback.c)
  40. * i_pages lock (widely used, in set_page_dirty,
  41. * in arch-dependent flush_dcache_mmap_lock,
  42. * within bdi.wb->list_lock in __sync_single_inode)
  43. *
  44. * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon)
  45. * ->tasklist_lock
  46. * pte map lock
  47. *
  48. * hugetlbfs PageHuge() takes locks in this order:
  49. * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
  50. * vma_lock (hugetlb specific lock for pmd_sharing)
  51. * mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
  52. * folio_lock
  53. */
  54. #include <linux/mm.h>
  55. #include <linux/sched/mm.h>
  56. #include <linux/sched/task.h>
  57. #include <linux/pagemap.h>
  58. #include <linux/swap.h>
  59. #include <linux/swapops.h>
  60. #include <linux/slab.h>
  61. #include <linux/init.h>
  62. #include <linux/ksm.h>
  63. #include <linux/rmap.h>
  64. #include <linux/rcupdate.h>
  65. #include <linux/export.h>
  66. #include <linux/memcontrol.h>
  67. #include <linux/mmu_notifier.h>
  68. #include <linux/migrate.h>
  69. #include <linux/hugetlb.h>
  70. #include <linux/huge_mm.h>
  71. #include <linux/backing-dev.h>
  72. #include <linux/page_idle.h>
  73. #include <linux/memremap.h>
  74. #include <linux/userfaultfd_k.h>
  75. #include <linux/mm_inline.h>
  76. #include <linux/oom.h>
  77. #include <asm/tlbflush.h>
  78. #define CREATE_TRACE_POINTS
  79. #include <trace/events/tlb.h>
  80. #include <trace/events/migrate.h>
  81. #include "internal.h"
  82. static struct kmem_cache *anon_vma_cachep;
  83. static struct kmem_cache *anon_vma_chain_cachep;
  84. static inline struct anon_vma *anon_vma_alloc(void)
  85. {
  86. struct anon_vma *anon_vma;
  87. anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
  88. if (anon_vma) {
  89. atomic_set(&anon_vma->refcount, 1);
  90. anon_vma->num_children = 0;
  91. anon_vma->num_active_vmas = 0;
  92. anon_vma->parent = anon_vma;
  93. /*
  94. * Initialise the anon_vma root to point to itself. If called
  95. * from fork, the root will be reset to the parent's anon_vma.
  96. */
  97. anon_vma->root = anon_vma;
  98. }
  99. return anon_vma;
  100. }
  101. static inline void anon_vma_free(struct anon_vma *anon_vma)
  102. {
  103. VM_BUG_ON(atomic_read(&anon_vma->refcount));
  104. /*
  105. * Synchronize against folio_lock_anon_vma_read() such that
  106. * we can safely hold the lock without the anon_vma getting
  107. * freed.
  108. *
  109. * Relies on the full mb implied by the atomic_dec_and_test() from
  110. * put_anon_vma() against the acquire barrier implied by
  111. * down_read_trylock() from folio_lock_anon_vma_read(). This orders:
  112. *
  113. * folio_lock_anon_vma_read() VS put_anon_vma()
  114. * down_read_trylock() atomic_dec_and_test()
  115. * LOCK MB
  116. * atomic_read() rwsem_is_locked()
  117. *
  118. * LOCK should suffice since the actual taking of the lock must
  119. * happen _before_ what follows.
  120. */
  121. might_sleep();
  122. if (rwsem_is_locked(&anon_vma->root->rwsem)) {
  123. anon_vma_lock_write(anon_vma);
  124. anon_vma_unlock_write(anon_vma);
  125. }
  126. kmem_cache_free(anon_vma_cachep, anon_vma);
  127. }
  128. static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
  129. {
  130. return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
  131. }
  132. static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
  133. {
  134. kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
  135. }
  136. static void anon_vma_chain_link(struct vm_area_struct *vma,
  137. struct anon_vma_chain *avc,
  138. struct anon_vma *anon_vma)
  139. {
  140. avc->vma = vma;
  141. avc->anon_vma = anon_vma;
  142. list_add(&avc->same_vma, &vma->anon_vma_chain);
  143. anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
  144. }
  145. /**
  146. * __anon_vma_prepare - attach an anon_vma to a memory region
  147. * @vma: the memory region in question
  148. *
  149. * This makes sure the memory mapping described by 'vma' has
  150. * an 'anon_vma' attached to it, so that we can associate the
  151. * anonymous pages mapped into it with that anon_vma.
  152. *
  153. * The common case will be that we already have one, which
  154. * is handled inline by anon_vma_prepare(). But if
  155. * not we either need to find an adjacent mapping that we
  156. * can re-use the anon_vma from (very common when the only
  157. * reason for splitting a vma has been mprotect()), or we
  158. * allocate a new one.
  159. *
  160. * Anon-vma allocations are very subtle, because we may have
  161. * optimistically looked up an anon_vma in folio_lock_anon_vma_read()
  162. * and that may actually touch the rwsem even in the newly
  163. * allocated vma (it depends on RCU to make sure that the
  164. * anon_vma isn't actually destroyed).
  165. *
  166. * As a result, we need to do proper anon_vma locking even
  167. * for the new allocation. At the same time, we do not want
  168. * to do any locking for the common case of already having
  169. * an anon_vma.
  170. */
  171. int __anon_vma_prepare(struct vm_area_struct *vma)
  172. {
  173. struct mm_struct *mm = vma->vm_mm;
  174. struct anon_vma *anon_vma, *allocated;
  175. struct anon_vma_chain *avc;
  176. mmap_assert_locked(mm);
  177. might_sleep();
  178. avc = anon_vma_chain_alloc(GFP_KERNEL);
  179. if (!avc)
  180. goto out_enomem;
  181. anon_vma = find_mergeable_anon_vma(vma);
  182. allocated = NULL;
  183. if (!anon_vma) {
  184. anon_vma = anon_vma_alloc();
  185. if (unlikely(!anon_vma))
  186. goto out_enomem_free_avc;
  187. anon_vma->num_children++; /* self-parent link for new root */
  188. allocated = anon_vma;
  189. }
  190. anon_vma_lock_write(anon_vma);
  191. /* page_table_lock to protect against threads */
  192. spin_lock(&mm->page_table_lock);
  193. if (likely(!vma->anon_vma)) {
  194. vma->anon_vma = anon_vma;
  195. anon_vma_chain_link(vma, avc, anon_vma);
  196. anon_vma->num_active_vmas++;
  197. allocated = NULL;
  198. avc = NULL;
  199. }
  200. spin_unlock(&mm->page_table_lock);
  201. anon_vma_unlock_write(anon_vma);
  202. if (unlikely(allocated))
  203. put_anon_vma(allocated);
  204. if (unlikely(avc))
  205. anon_vma_chain_free(avc);
  206. return 0;
  207. out_enomem_free_avc:
  208. anon_vma_chain_free(avc);
  209. out_enomem:
  210. return -ENOMEM;
  211. }
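/*
 * For reference, the common fast path mentioned above is handled inline by
 * anon_vma_prepare() in include/linux/rmap.h, which is roughly:
 *
 *	static inline int anon_vma_prepare(struct vm_area_struct *vma)
 *	{
 *		if (likely(vma->anon_vma))
 *			return 0;
 *		return __anon_vma_prepare(vma);
 *	}
 *
 * so __anon_vma_prepare() only runs for a vma that has no anon_vma yet.
 */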
  212. /*
  213. * This is a useful helper function for locking the anon_vma root as
  214. * we traverse the vma->anon_vma_chain, looping over anon_vmas that
  215. * have the same vma.
  216. *
  217. * Such anon_vmas should all have the same root, so you'd expect to see
  218. * just a single lock acquisition for the whole traversal.
  219. */
  220. static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
  221. {
  222. struct anon_vma *new_root = anon_vma->root;
  223. if (new_root != root) {
  224. if (WARN_ON_ONCE(root))
  225. up_write(&root->rwsem);
  226. root = new_root;
  227. down_write(&root->rwsem);
  228. }
  229. return root;
  230. }
  231. static inline void unlock_anon_vma_root(struct anon_vma *root)
  232. {
  233. if (root)
  234. up_write(&root->rwsem);
  235. }
  236. /*
  237. * Attach the anon_vmas from src to dst.
  238. * Returns 0 on success, -ENOMEM on failure.
  239. *
  240. * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(),
  241. * copy_vma() and anon_vma_fork(). The first four want an exact copy of src,
  242. * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to
  243. * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before
  244. * call, we can identify this case by checking (!dst->anon_vma &&
  245. * src->anon_vma).
  246. *
  247. * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
  248. * and reuse an existing anon_vma which has no vmas and only one child anon_vma.
  249. * This prevents the anon_vma hierarchy from degrading into an endless linear
  250. * chain when a task forks repeatedly. On the other hand, an anon_vma with more
  251. * than one child isn't reused even if no vma is alive, so the rmap
  252. * walker has a good chance of avoiding a scan of the whole hierarchy when it
  253. * searches for where a page is mapped.
  254. */
  255. int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
  256. {
  257. struct anon_vma_chain *avc, *pavc;
  258. struct anon_vma *root = NULL;
  259. list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
  260. struct anon_vma *anon_vma;
  261. avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
  262. if (unlikely(!avc)) {
  263. unlock_anon_vma_root(root);
  264. root = NULL;
  265. avc = anon_vma_chain_alloc(GFP_KERNEL);
  266. if (!avc)
  267. goto enomem_failure;
  268. }
  269. anon_vma = pavc->anon_vma;
  270. root = lock_anon_vma_root(root, anon_vma);
  271. anon_vma_chain_link(dst, avc, anon_vma);
  272. /*
  273. * Reuse existing anon_vma if it has no vma and only one
  274. * anon_vma child.
  275. *
  276. * Root anon_vma is never reused:
  277. * it has self-parent reference and at least one child.
  278. */
  279. if (!dst->anon_vma && src->anon_vma &&
  280. anon_vma->num_children < 2 &&
  281. anon_vma->num_active_vmas == 0)
  282. dst->anon_vma = anon_vma;
  283. }
  284. if (dst->anon_vma)
  285. dst->anon_vma->num_active_vmas++;
  286. unlock_anon_vma_root(root);
  287. return 0;
  288. enomem_failure:
  289. /*
  290. * dst->anon_vma is dropped here otherwise its num_active_vmas can
  291. * be incorrectly decremented in unlink_anon_vmas().
  292. * We can safely do this because callers of anon_vma_clone() don't care
  293. * about dst->anon_vma if anon_vma_clone() failed.
  294. */
  295. dst->anon_vma = NULL;
  296. unlink_anon_vmas(dst);
  297. return -ENOMEM;
  298. }
  299. /*
  300. * Attach vma to its own anon_vma, as well as to the anon_vmas that
  301. * the corresponding VMA in the parent process is attached to.
  302. * Returns 0 on success, non-zero on failure.
  303. */
  304. int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
  305. {
  306. struct anon_vma_chain *avc;
  307. struct anon_vma *anon_vma;
  308. int error;
  309. /* Don't bother if the parent process has no anon_vma here. */
  310. if (!pvma->anon_vma)
  311. return 0;
  312. /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
  313. vma->anon_vma = NULL;
  314. /*
  315. * First, attach the new VMA to the parent VMA's anon_vmas,
  316. * so rmap can find non-COWed pages in child processes.
  317. */
  318. error = anon_vma_clone(vma, pvma);
  319. if (error)
  320. return error;
  321. /* An existing anon_vma has been reused, all done then. */
  322. if (vma->anon_vma)
  323. return 0;
  324. /* Then add our own anon_vma. */
  325. anon_vma = anon_vma_alloc();
  326. if (!anon_vma)
  327. goto out_error;
  328. anon_vma->num_active_vmas++;
  329. avc = anon_vma_chain_alloc(GFP_KERNEL);
  330. if (!avc)
  331. goto out_error_free_anon_vma;
  332. /*
  333. * The root anon_vma's rwsem is the lock actually used when we
  334. * lock any of the anon_vmas in this anon_vma tree.
  335. */
  336. anon_vma->root = pvma->anon_vma->root;
  337. anon_vma->parent = pvma->anon_vma;
  338. /*
  339. * With refcounts, an anon_vma can stay around longer than the
  340. * process it belongs to. The root anon_vma needs to be pinned until
  341. * this anon_vma is freed, because the lock lives in the root.
  342. */
  343. get_anon_vma(anon_vma->root);
  344. /* Mark this anon_vma as the one where our new (COWed) pages go. */
  345. vma->anon_vma = anon_vma;
  346. anon_vma_lock_write(anon_vma);
  347. anon_vma_chain_link(vma, avc, anon_vma);
  348. anon_vma->parent->num_children++;
  349. anon_vma_unlock_write(anon_vma);
  350. return 0;
  351. out_error_free_anon_vma:
  352. put_anon_vma(anon_vma);
  353. out_error:
  354. unlink_anon_vmas(vma);
  355. return -ENOMEM;
  356. }
  357. void unlink_anon_vmas(struct vm_area_struct *vma)
  358. {
  359. struct anon_vma_chain *avc, *next;
  360. struct anon_vma *root = NULL;
  361. /*
  362. * Unlink each anon_vma chained to the VMA. This list is ordered
  363. * from newest to oldest, ensuring the root anon_vma gets freed last.
  364. */
  365. list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
  366. struct anon_vma *anon_vma = avc->anon_vma;
  367. root = lock_anon_vma_root(root, anon_vma);
  368. anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
  369. /*
  370. * Leave empty anon_vmas on the list - we'll need
  371. * to free them outside the lock.
  372. */
  373. if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
  374. anon_vma->parent->num_children--;
  375. continue;
  376. }
  377. list_del(&avc->same_vma);
  378. anon_vma_chain_free(avc);
  379. }
  380. if (vma->anon_vma) {
  381. vma->anon_vma->num_active_vmas--;
  382. /*
  383. * The vma may still be needed after unlink; its anon_vma will be prepared
  384. * again when a fault is handled.
  385. */
  386. vma->anon_vma = NULL;
  387. }
  388. unlock_anon_vma_root(root);
  389. /*
  390. * Iterate the list once more, it now only contains empty and unlinked
  391. * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
  392. * needing to write-acquire the anon_vma->root->rwsem.
  393. */
  394. list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
  395. struct anon_vma *anon_vma = avc->anon_vma;
  396. VM_WARN_ON(anon_vma->num_children);
  397. VM_WARN_ON(anon_vma->num_active_vmas);
  398. put_anon_vma(anon_vma);
  399. list_del(&avc->same_vma);
  400. anon_vma_chain_free(avc);
  401. }
  402. }
  403. static void anon_vma_ctor(void *data)
  404. {
  405. struct anon_vma *anon_vma = data;
  406. init_rwsem(&anon_vma->rwsem);
  407. atomic_set(&anon_vma->refcount, 0);
  408. anon_vma->rb_root = RB_ROOT_CACHED;
  409. }
  410. void __init anon_vma_init(void)
  411. {
  412. anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
  413. 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
  414. anon_vma_ctor);
  415. anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
  416. SLAB_PANIC|SLAB_ACCOUNT);
  417. }
  418. /*
  419. * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
  420. *
  421. * Since there is no serialization whatsoever against folio_remove_rmap_*(),
  422. * the best this function can do is return a refcount-increased anon_vma
  423. * that might have been relevant to this page.
  424. *
  425. * The page might have been remapped to a different anon_vma or the anon_vma
  426. * returned may already be freed (and even reused).
  427. *
  428. * In case it was remapped to a different anon_vma, the new anon_vma will be a
  429. * child of the old anon_vma, and the anon_vma lifetime rules will therefore
  430. * ensure that any anon_vma obtained from the page will still be valid for as
  431. * long as we observe page_mapped() [ hence all those page_mapped() tests ].
  432. *
  433. * All users of this function must be very careful when walking the anon_vma
  434. * chain and verify that the page in question is indeed mapped in it
  435. * [ something equivalent to page_mapped_in_vma() ].
  436. *
  437. * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
  438. * folio_remove_rmap_*() that the anon_vma pointer from page->mapping is valid
  439. * if there is a mapcount, we can dereference the anon_vma after observing
  440. * those.
  441. *
  442. * NOTE: the caller should normally hold the folio lock when calling this. If
  443. * not, the caller needs to double check the anon_vma didn't change after
  444. * taking the anon_vma lock for either read or write (UFFDIO_MOVE can modify it
  445. * concurrently without folio lock protection). See folio_lock_anon_vma_read()
  446. * which has already covered that, and comment above remap_pages().
  447. */
  448. struct anon_vma *folio_get_anon_vma(struct folio *folio)
  449. {
  450. struct anon_vma *anon_vma = NULL;
  451. unsigned long anon_mapping;
  452. rcu_read_lock();
  453. anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
  454. if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
  455. goto out;
  456. if (!folio_mapped(folio))
  457. goto out;
  458. anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
  459. if (!atomic_inc_not_zero(&anon_vma->refcount)) {
  460. anon_vma = NULL;
  461. goto out;
  462. }
  463. /*
  464. * If this folio is still mapped, then its anon_vma cannot have been
  465. * freed. But if it has been unmapped, we have no security against the
  466. * anon_vma structure being freed and reused (for another anon_vma:
  467. * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
  468. * above cannot corrupt).
  469. */
  470. if (!folio_mapped(folio)) {
  471. rcu_read_unlock();
  472. put_anon_vma(anon_vma);
  473. return NULL;
  474. }
  475. out:
  476. rcu_read_unlock();
  477. return anon_vma;
  478. }
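/*
 * A minimal usage sketch (hypothetical caller, not part of this file): pin
 * the anon_vma so it cannot be freed while we use it, re-check that the
 * folio is still mapped as the comment above requires, and drop the
 * reference with put_anon_vma() when done.
 *
 *	struct anon_vma *anon_vma = folio_get_anon_vma(folio);
 *
 *	if (anon_vma) {
 *		anon_vma_lock_read(anon_vma);
 *		...walk the interval tree, verifying the folio is mapped...
 *		anon_vma_unlock_read(anon_vma);
 *		put_anon_vma(anon_vma);
 *	}
 */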
  479. /*
  480. * Similar to folio_get_anon_vma() except it locks the anon_vma.
  481. *
  482. * It's a little more complex as it tries to keep the fast path to a single
  483. * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
  484. * reference like with folio_get_anon_vma() and then block on the rwsem
  485. * in the !rwc->try_lock case.
  486. */
  487. struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
  488. struct rmap_walk_control *rwc)
  489. {
  490. struct anon_vma *anon_vma = NULL;
  491. struct anon_vma *root_anon_vma;
  492. unsigned long anon_mapping;
  493. retry:
  494. rcu_read_lock();
  495. anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
  496. if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
  497. goto out;
  498. if (!folio_mapped(folio))
  499. goto out;
  500. anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
  501. root_anon_vma = READ_ONCE(anon_vma->root);
  502. if (down_read_trylock(&root_anon_vma->rwsem)) {
  503. /*
  504. * folio_move_anon_rmap() might have changed the anon_vma as we
  505. * might not hold the folio lock here.
  506. */
  507. if (unlikely((unsigned long)READ_ONCE(folio->mapping) !=
  508. anon_mapping)) {
  509. up_read(&root_anon_vma->rwsem);
  510. rcu_read_unlock();
  511. goto retry;
  512. }
  513. /*
  514. * If the folio is still mapped, then this anon_vma is still
  515. * its anon_vma, and holding the mutex ensures that it will
  516. * not go away, see anon_vma_free().
  517. */
  518. if (!folio_mapped(folio)) {
  519. up_read(&root_anon_vma->rwsem);
  520. anon_vma = NULL;
  521. }
  522. goto out;
  523. }
  524. if (rwc && rwc->try_lock) {
  525. anon_vma = NULL;
  526. rwc->contended = true;
  527. goto out;
  528. }
  529. /* trylock failed, we have to sleep */
  530. if (!atomic_inc_not_zero(&anon_vma->refcount)) {
  531. anon_vma = NULL;
  532. goto out;
  533. }
  534. if (!folio_mapped(folio)) {
  535. rcu_read_unlock();
  536. put_anon_vma(anon_vma);
  537. return NULL;
  538. }
  539. /* we pinned the anon_vma, it's safe to sleep */
  540. rcu_read_unlock();
  541. anon_vma_lock_read(anon_vma);
  542. /*
  543. * folio_move_anon_rmap() might have changed the anon_vma as we might
  544. * not hold the folio lock here.
  545. */
  546. if (unlikely((unsigned long)READ_ONCE(folio->mapping) !=
  547. anon_mapping)) {
  548. anon_vma_unlock_read(anon_vma);
  549. put_anon_vma(anon_vma);
  550. anon_vma = NULL;
  551. goto retry;
  552. }
  553. if (atomic_dec_and_test(&anon_vma->refcount)) {
  554. /*
  555. * Oops, we held the last refcount, release the lock
  556. * and bail -- can't simply use put_anon_vma() because
  557. * we'll deadlock on the anon_vma_lock_write() recursion.
  558. */
  559. anon_vma_unlock_read(anon_vma);
  560. __put_anon_vma(anon_vma);
  561. anon_vma = NULL;
  562. }
  563. return anon_vma;
  564. out:
  565. rcu_read_unlock();
  566. return anon_vma;
  567. }
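/*
 * A sketch of the try_lock contract (hypothetical caller): with
 * rwc->try_lock set, a NULL return with rwc->contended == true means
 * "the lock would have blocked", not "the folio has no anon_vma".
 *
 *	anon_vma = folio_lock_anon_vma_read(folio, rwc);
 *	if (!anon_vma) {
 *		if (rwc->contended)
 *			...bail out and report contention...
 *		return;
 *	}
 *	...walk the anon_vma...
 *	anon_vma_unlock_read(anon_vma);
 *
 * folio_referenced() below is a real user of this: it sets try_lock and
 * returns -1 when rwc.contended ends up set.
 */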
  568. #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
  569. /*
  570. * Flush TLB entries for recently unmapped pages from remote CPUs. If a PTE
  571. * was dirty when it was unmapped, it is important that it be flushed
  572. * before any IO is initiated on the page, to prevent lost writes. Similarly,
  573. * it must be flushed before freeing to prevent data leakage.
  574. */
  575. void try_to_unmap_flush(void)
  576. {
  577. struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
  578. if (!tlb_ubc->flush_required)
  579. return;
  580. arch_tlbbatch_flush(&tlb_ubc->arch);
  581. tlb_ubc->flush_required = false;
  582. tlb_ubc->writable = false;
  583. }
  584. /* Flush iff there are potentially writable TLB entries that can race with IO */
  585. void try_to_unmap_flush_dirty(void)
  586. {
  587. struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
  588. if (tlb_ubc->writable)
  589. try_to_unmap_flush();
  590. }
  591. /*
  592. * Bits 0-14 of mm->tlb_flush_batched record pending generations.
  593. * Bits 16-30 of mm->tlb_flush_batched record flushed generations.
  594. */
  595. #define TLB_FLUSH_BATCH_FLUSHED_SHIFT 16
  596. #define TLB_FLUSH_BATCH_PENDING_MASK \
  597. ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1)
  598. #define TLB_FLUSH_BATCH_PENDING_LARGE \
  599. (TLB_FLUSH_BATCH_PENDING_MASK / 2)
  600. static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
  601. unsigned long uaddr)
  602. {
  603. struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
  604. int batch;
  605. bool writable = pte_dirty(pteval);
  606. if (!pte_accessible(mm, pteval))
  607. return;
  608. arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr);
  609. tlb_ubc->flush_required = true;
  610. /*
  611. * Ensure compiler does not re-order the setting of tlb_flush_batched
  612. * before the PTE is cleared.
  613. */
  614. barrier();
  615. batch = atomic_read(&mm->tlb_flush_batched);
  616. retry:
  617. if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) {
  618. /*
  619. * Prevent `pending' from catching up with `flushed' because of
  620. * overflow. Reset `pending' and `flushed' to be 1 and 0 if
  621. * `pending' becomes large.
  622. */
  623. if (!atomic_try_cmpxchg(&mm->tlb_flush_batched, &batch, 1))
  624. goto retry;
  625. } else {
  626. atomic_inc(&mm->tlb_flush_batched);
  627. }
  628. /*
  629. * If the PTE was dirty then it's best to assume it's writable. The
  630. * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
  631. * before the page is queued for IO.
  632. */
  633. if (writable)
  634. tlb_ubc->writable = true;
  635. }
  636. /*
  637. * Returns true if the TLB flush should be deferred to the end of a batch of
  638. * unmap operations to reduce IPIs.
  639. */
  640. static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
  641. {
  642. if (!(flags & TTU_BATCH_FLUSH))
  643. return false;
  644. return arch_tlbbatch_should_defer(mm);
  645. }
  646. /*
  647. * Reclaim unmaps pages under the PTL but does not flush the TLB prior to
  648. * releasing the PTL if TLB flushes are batched. It's possible for a parallel
  649. * operation such as mprotect or munmap to race between reclaim unmapping
  650. * the page and flushing the page. If this race occurs, it potentially allows
  651. * access to data via a stale TLB entry. Tracking all mm's that have TLB
  652. * batching in flight would be expensive during reclaim so instead track
  653. * whether TLB batching occurred in the past and if so then do a flush here
  654. * if required. This will cost one additional flush per reclaim cycle paid
  655. * by the first operation at risk such as mprotect and munmap.
  656. *
  657. * This must be called under the PTL so that an access to tlb_flush_batched
  658. * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
  659. * via the PTL.
  660. */
  661. void flush_tlb_batched_pending(struct mm_struct *mm)
  662. {
  663. int batch = atomic_read(&mm->tlb_flush_batched);
  664. int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK;
  665. int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
  666. if (pending != flushed) {
  667. arch_flush_tlb_batched_pending(mm);
  668. /*
  669. * If new TLB flushes became pending while we were flushing, leave
  670. * mm->tlb_flush_batched as is, to avoid losing a flush.
  671. */
  672. atomic_cmpxchg(&mm->tlb_flush_batched, batch,
  673. pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT));
  674. }
  675. }
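/*
 * A worked example of the pending/flushed encoding above (values are
 * illustrative): if mm->tlb_flush_batched holds 5 | (3 << 16), i.e.
 * pending == 5 and flushed == 3, then two batched unmaps have not been
 * flushed yet, so flush_tlb_batched_pending() calls
 * arch_flush_tlb_batched_pending() and tries to store 5 | (5 << 16).
 * If set_tlb_ubc_flush_pending() raced and bumped pending to 6, the
 * cmpxchg fails and the value stays 6 | (3 << 16), so a later caller
 * still sees pending != flushed and flushes again.
 */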
  676. #else
  677. static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
  678. unsigned long uaddr)
  679. {
  680. }
  681. static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
  682. {
  683. return false;
  684. }
  685. #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
  686. /*
  687. * At what user virtual address is page expected in vma?
  688. * Caller should check the page is actually part of the vma.
  689. */
  690. unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
  691. {
  692. struct folio *folio = page_folio(page);
  693. pgoff_t pgoff;
  694. if (folio_test_anon(folio)) {
  695. struct anon_vma *page__anon_vma = folio_anon_vma(folio);
  696. /*
  697. * Note: swapoff's unuse_vma() is more efficient with this
  698. * check, and needs it to match anon_vma when KSM is active.
  699. */
  700. if (!vma->anon_vma || !page__anon_vma ||
  701. vma->anon_vma->root != page__anon_vma->root)
  702. return -EFAULT;
  703. } else if (!vma->vm_file) {
  704. return -EFAULT;
  705. } else if (vma->vm_file->f_mapping != folio->mapping) {
  706. return -EFAULT;
  707. }
  708. /* The !page__anon_vma above handles KSM folios */
  709. pgoff = folio->index + folio_page_idx(folio, page);
  710. return vma_address(vma, pgoff, 1);
  711. }
  712. /*
  713. * Returns the actual pmd_t* where we expect 'address' to be mapped from, or
  714. * NULL if it doesn't exist. No guarantees / checks on what the pmd_t*
  715. * represents.
  716. */
  717. pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
  718. {
  719. pgd_t *pgd;
  720. p4d_t *p4d;
  721. pud_t *pud;
  722. pmd_t *pmd = NULL;
  723. pgd = pgd_offset(mm, address);
  724. if (!pgd_present(*pgd))
  725. goto out;
  726. p4d = p4d_offset(pgd, address);
  727. if (!p4d_present(*p4d))
  728. goto out;
  729. pud = pud_offset(p4d, address);
  730. if (!pud_present(*pud))
  731. goto out;
  732. pmd = pmd_offset(pud, address);
  733. out:
  734. return pmd;
  735. }
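/*
 * A minimal usage sketch (hypothetical caller): since mm_find_pmd() makes
 * no guarantees about what the returned pmd_t * contains, callers are
 * expected to re-validate it under the page table lock before use:
 *
 *	pmd_t *pmd = mm_find_pmd(mm, address);
 *
 *	if (!pmd)
 *		return;
 *	ptl = pmd_lock(mm, pmd);
 *	...re-check *pmd, e.g. pmd_present(), under ptl...
 *	spin_unlock(ptl);
 */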
  736. struct folio_referenced_arg {
  737. int mapcount;
  738. int referenced;
  739. unsigned long vm_flags;
  740. struct mem_cgroup *memcg;
  741. };
  742. /*
  743. * arg: folio_referenced_arg will be passed
  744. */
  745. static bool folio_referenced_one(struct folio *folio,
  746. struct vm_area_struct *vma, unsigned long address, void *arg)
  747. {
  748. struct folio_referenced_arg *pra = arg;
  749. DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
  750. int referenced = 0;
  751. unsigned long start = address, ptes = 0;
  752. while (page_vma_mapped_walk(&pvmw)) {
  753. address = pvmw.address;
  754. if (vma->vm_flags & VM_LOCKED) {
  755. if (!folio_test_large(folio) || !pvmw.pte) {
  756. /* Restore the mlock which got missed */
  757. mlock_vma_folio(folio, vma);
  758. page_vma_mapped_walk_done(&pvmw);
  759. pra->vm_flags |= VM_LOCKED;
  760. return false; /* To break the loop */
  761. }
  762. /*
  763. * A large folio that is fully mapped to the VMA
  764. * is handled after the pvmw loop.
  765. *
  766. * A large folio that crosses VMA boundaries is
  767. * expected to be picked up by page reclaim, but
  768. * references to pages within the range of a
  769. * VM_LOCKED vma should be skipped here, as page
  770. * reclaim should only count references to pages
  771. * outside the range of the VM_LOCKED vma.
  772. */
  773. ptes++;
  774. pra->mapcount--;
  775. continue;
  776. }
  777. /*
  778. * Skip the non-shared swapbacked folio mapped solely by
  779. * the exiting or OOM-reaped process. This avoids redundant
  780. * swap-out followed by an immediate unmap.
  781. */
  782. if ((!atomic_read(&vma->vm_mm->mm_users) ||
  783. check_stable_address_space(vma->vm_mm)) &&
  784. folio_test_anon(folio) && folio_test_swapbacked(folio) &&
  785. !folio_likely_mapped_shared(folio)) {
  786. pra->referenced = -1;
  787. page_vma_mapped_walk_done(&pvmw);
  788. return false;
  789. }
  790. if (lru_gen_enabled() && pvmw.pte) {
  791. if (lru_gen_look_around(&pvmw))
  792. referenced++;
  793. } else if (pvmw.pte) {
  794. if (ptep_clear_flush_young_notify(vma, address,
  795. pvmw.pte))
  796. referenced++;
  797. } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
  798. if (pmdp_clear_flush_young_notify(vma, address,
  799. pvmw.pmd))
  800. referenced++;
  801. } else {
  802. /* unexpected pmd-mapped folio? */
  803. WARN_ON_ONCE(1);
  804. }
  805. pra->mapcount--;
  806. }
  807. if ((vma->vm_flags & VM_LOCKED) &&
  808. folio_test_large(folio) &&
  809. folio_within_vma(folio, vma)) {
  810. unsigned long s_align, e_align;
  811. s_align = ALIGN_DOWN(start, PMD_SIZE);
  812. e_align = ALIGN_DOWN(start + folio_size(folio) - 1, PMD_SIZE);
  813. /* folio doesn't cross page table boundary and fully mapped */
  814. if ((s_align == e_align) && (ptes == folio_nr_pages(folio))) {
  815. /* Restore the mlock which got missed */
  816. mlock_vma_folio(folio, vma);
  817. pra->vm_flags |= VM_LOCKED;
  818. return false; /* To break the loop */
  819. }
  820. }
  821. if (referenced)
  822. folio_clear_idle(folio);
  823. if (folio_test_clear_young(folio))
  824. referenced++;
  825. if (referenced) {
  826. pra->referenced++;
  827. pra->vm_flags |= vma->vm_flags & ~VM_LOCKED;
  828. }
  829. if (!pra->mapcount)
  830. return false; /* To break the loop */
  831. return true;
  832. }
  833. static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
  834. {
  835. struct folio_referenced_arg *pra = arg;
  836. struct mem_cgroup *memcg = pra->memcg;
  837. /*
  838. * Ignore references from this mapping if it has no recency. If the
  839. * folio has been used in another mapping, we will catch it; if this
  840. * other mapping is already gone, the unmap path will have set the
  841. * referenced flag or activated the folio in zap_pte_range().
  842. */
  843. if (!vma_has_recency(vma))
  844. return true;
  845. /*
  846. * If we are reclaiming on behalf of a cgroup, skip counting on behalf
  847. * of references from different cgroups.
  848. */
  849. if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
  850. return true;
  851. return false;
  852. }
  853. /**
  854. * folio_referenced() - Test if the folio was referenced.
  855. * @folio: The folio to test.
  856. * @is_locked: Caller holds lock on the folio.
  857. * @memcg: target memory cgroup
  858. * @vm_flags: A combination of all the vma->vm_flags which referenced the folio.
  859. *
  860. * Quick test_and_clear_referenced for all mappings of a folio.
  861. *
  862. * Return: The number of mappings which referenced the folio. Return -1 if
  863. * the function bailed out due to rmap lock contention.
  864. */
  865. int folio_referenced(struct folio *folio, int is_locked,
  866. struct mem_cgroup *memcg, unsigned long *vm_flags)
  867. {
  868. bool we_locked = false;
  869. struct folio_referenced_arg pra = {
  870. .mapcount = folio_mapcount(folio),
  871. .memcg = memcg,
  872. };
  873. struct rmap_walk_control rwc = {
  874. .rmap_one = folio_referenced_one,
  875. .arg = (void *)&pra,
  876. .anon_lock = folio_lock_anon_vma_read,
  877. .try_lock = true,
  878. .invalid_vma = invalid_folio_referenced_vma,
  879. };
  880. *vm_flags = 0;
  881. if (!pra.mapcount)
  882. return 0;
  883. if (!folio_raw_mapping(folio))
  884. return 0;
  885. if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) {
  886. we_locked = folio_trylock(folio);
  887. if (!we_locked)
  888. return 1;
  889. }
  890. rmap_walk(folio, &rwc);
  891. *vm_flags = pra.vm_flags;
  892. if (we_locked)
  893. folio_unlock(folio);
  894. return rwc.contended ? -1 : pra.referenced;
  895. }
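/*
 * A minimal sketch of how a caller might interpret the result (hypothetical
 * helper, not from this file): -1 signals rmap lock contention, so the folio
 * is usually treated as referenced, and VM_LOCKED in *vm_flags means the
 * folio should be left to mlock handling rather than reclaimed.
 *
 *	unsigned long vm_flags;
 *	int refs = folio_referenced(folio, is_locked, memcg, &vm_flags);
 *
 *	if (refs == -1 || (vm_flags & VM_LOCKED))
 *		...keep the folio, do not reclaim it now...
 *	else if (refs)
 *		...the folio was recently used...
 */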
  896. static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
  897. {
  898. int cleaned = 0;
  899. struct vm_area_struct *vma = pvmw->vma;
  900. struct mmu_notifier_range range;
  901. unsigned long address = pvmw->address;
  902. /*
  903. * We have to assume the worst case, i.e. pmd, for invalidation. Note that
  904. * the folio cannot be freed from this function.
  905. */
  906. mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0,
  907. vma->vm_mm, address, vma_address_end(pvmw));
  908. mmu_notifier_invalidate_range_start(&range);
  909. while (page_vma_mapped_walk(pvmw)) {
  910. int ret = 0;
  911. address = pvmw->address;
  912. if (pvmw->pte) {
  913. pte_t *pte = pvmw->pte;
  914. pte_t entry = ptep_get(pte);
  915. if (!pte_dirty(entry) && !pte_write(entry))
  916. continue;
  917. flush_cache_page(vma, address, pte_pfn(entry));
  918. entry = ptep_clear_flush(vma, address, pte);
  919. entry = pte_wrprotect(entry);
  920. entry = pte_mkclean(entry);
  921. set_pte_at(vma->vm_mm, address, pte, entry);
  922. ret = 1;
  923. } else {
  924. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  925. pmd_t *pmd = pvmw->pmd;
  926. pmd_t entry;
  927. if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
  928. continue;
  929. flush_cache_range(vma, address,
  930. address + HPAGE_PMD_SIZE);
  931. entry = pmdp_invalidate(vma, address, pmd);
  932. entry = pmd_wrprotect(entry);
  933. entry = pmd_mkclean(entry);
  934. set_pmd_at(vma->vm_mm, address, pmd, entry);
  935. ret = 1;
  936. #else
  937. /* unexpected pmd-mapped folio? */
  938. WARN_ON_ONCE(1);
  939. #endif
  940. }
  941. if (ret)
  942. cleaned++;
  943. }
  944. mmu_notifier_invalidate_range_end(&range);
  945. return cleaned;
  946. }
  947. static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
  948. unsigned long address, void *arg)
  949. {
  950. DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
  951. int *cleaned = arg;
  952. *cleaned += page_vma_mkclean_one(&pvmw);
  953. return true;
  954. }
  955. static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
  956. {
  957. if (vma->vm_flags & VM_SHARED)
  958. return false;
  959. return true;
  960. }
  961. int folio_mkclean(struct folio *folio)
  962. {
  963. int cleaned = 0;
  964. struct address_space *mapping;
  965. struct rmap_walk_control rwc = {
  966. .arg = (void *)&cleaned,
  967. .rmap_one = page_mkclean_one,
  968. .invalid_vma = invalid_mkclean_vma,
  969. };
  970. BUG_ON(!folio_test_locked(folio));
  971. if (!folio_mapped(folio))
  972. return 0;
  973. mapping = folio_mapping(folio);
  974. if (!mapping)
  975. return 0;
  976. rmap_walk(folio, &rwc);
  977. return cleaned;
  978. }
  979. EXPORT_SYMBOL_GPL(folio_mkclean);
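/*
 * A minimal usage sketch (hypothetical caller): writeback-style code cleans
 * all PTEs mapping a locked folio and re-dirties it if any PTE was still
 * writable or dirty, roughly:
 *
 *	folio_lock(folio);
 *	if (folio_mkclean(folio))
 *		folio_mark_dirty(folio);
 *	folio_unlock(folio);
 */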
  980. /**
  981. * pfn_mkclean_range - Cleans the PTEs (including PMDs) that map the range
  982. * [@pfn, @pfn + @nr_pages) at the given offset (@pgoff)
  983. * within @vma for shared mappings. Since clean PTEs
  984. * should also be read-only, they are write-protected too.
  985. * @pfn: start pfn.
  986. * @nr_pages: number of physically contiguous pages starting with @pfn.
  987. * @pgoff: page offset at which @pfn is mapped.
  988. * @vma: vma within which @pfn is mapped.
  989. *
  990. * Returns the number of cleaned PTEs (including PMDs).
  991. */
  992. int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
  993. struct vm_area_struct *vma)
  994. {
  995. struct page_vma_mapped_walk pvmw = {
  996. .pfn = pfn,
  997. .nr_pages = nr_pages,
  998. .pgoff = pgoff,
  999. .vma = vma,
  1000. .flags = PVMW_SYNC,
  1001. };
  1002. if (invalid_mkclean_vma(vma, NULL))
  1003. return 0;
  1004. pvmw.address = vma_address(vma, pgoff, nr_pages);
  1005. VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma);
  1006. return page_vma_mkclean_one(&pvmw);
  1007. }
  1008. static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
  1009. struct page *page, int nr_pages, enum rmap_level level,
  1010. int *nr_pmdmapped)
  1011. {
  1012. atomic_t *mapped = &folio->_nr_pages_mapped;
  1013. const int orig_nr_pages = nr_pages;
  1014. int first = 0, nr = 0;
  1015. __folio_rmap_sanity_checks(folio, page, nr_pages, level);
  1016. switch (level) {
  1017. case RMAP_LEVEL_PTE:
  1018. if (!folio_test_large(folio)) {
  1019. nr = atomic_inc_and_test(&folio->_mapcount);
  1020. break;
  1021. }
  1022. do {
  1023. first += atomic_inc_and_test(&page->_mapcount);
  1024. } while (page++, --nr_pages > 0);
  1025. if (first &&
  1026. atomic_add_return_relaxed(first, mapped) < ENTIRELY_MAPPED)
  1027. nr = first;
  1028. atomic_add(orig_nr_pages, &folio->_large_mapcount);
  1029. break;
  1030. case RMAP_LEVEL_PMD:
  1031. first = atomic_inc_and_test(&folio->_entire_mapcount);
  1032. if (first) {
  1033. nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped);
  1034. if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) {
  1035. *nr_pmdmapped = folio_nr_pages(folio);
  1036. nr = *nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
  1037. /* Raced ahead of a remove and another add? */
  1038. if (unlikely(nr < 0))
  1039. nr = 0;
  1040. } else {
  1041. /* Raced ahead of a remove of ENTIRELY_MAPPED */
  1042. nr = 0;
  1043. }
  1044. }
  1045. atomic_inc(&folio->_large_mapcount);
  1046. break;
  1047. }
  1048. return nr;
  1049. }
  1050. /**
  1051. * folio_move_anon_rmap - move a folio to our anon_vma
  1052. * @folio: The folio to move to our anon_vma
  1053. * @vma: The vma the folio belongs to
  1054. *
  1055. * When a folio belongs exclusively to one process after a COW event,
  1056. * that folio can be moved into the anon_vma that belongs to just that
  1057. * process, so the rmap code will not search the parent or sibling processes.
  1058. */
  1059. void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma)
  1060. {
  1061. void *anon_vma = vma->anon_vma;
  1062. VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
  1063. VM_BUG_ON_VMA(!anon_vma, vma);
  1064. anon_vma += PAGE_MAPPING_ANON;
  1065. /*
  1066. * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
  1067. * simultaneously, so a concurrent reader (eg folio_referenced()'s
  1068. * folio_test_anon()) will not see one without the other.
  1069. */
  1070. WRITE_ONCE(folio->mapping, anon_vma);
  1071. }
  1072. /**
  1073. * __folio_set_anon - set up a new anonymous rmap for a folio
  1074. * @folio: The folio to set up the new anonymous rmap for.
  1075. * @vma: VM area to add the folio to.
  1076. * @address: User virtual address of the mapping
  1077. * @exclusive: Whether the folio is exclusive to the process.
  1078. */
  1079. static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma,
  1080. unsigned long address, bool exclusive)
  1081. {
  1082. struct anon_vma *anon_vma = vma->anon_vma;
  1083. BUG_ON(!anon_vma);
  1084. /*
  1085. * If the folio isn't exclusive to this vma, we must use the _oldest_
  1086. * possible anon_vma for the folio mapping!
  1087. */
  1088. if (!exclusive)
  1089. anon_vma = anon_vma->root;
  1090. /*
  1091. * page_idle does a lockless/optimistic rmap scan on folio->mapping.
  1092. * Make sure the compiler doesn't split the stores of anon_vma and
  1093. * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code
  1094. * could mistake the mapping for a struct address_space and crash.
  1095. */
  1096. anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
  1097. WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma);
  1098. folio->index = linear_page_index(vma, address);
  1099. }
  1100. /**
  1101. * __page_check_anon_rmap - sanity check anonymous rmap addition
  1102. * @folio: The folio containing @page.
  1103. * @page: the page to check the mapping of
  1104. * @vma: the vm area in which the mapping is added
  1105. * @address: the user virtual address mapped
  1106. */
  1107. static void __page_check_anon_rmap(struct folio *folio, struct page *page,
  1108. struct vm_area_struct *vma, unsigned long address)
  1109. {
  1110. /*
  1111. * The page's anon-rmap details (mapping and index) are guaranteed to
  1112. * be set up correctly at this point.
  1113. *
  1114. * We have exclusion against folio_add_anon_rmap_*() because the caller
  1115. * always holds the page locked.
  1116. *
  1117. * We have exclusion against folio_add_new_anon_rmap because those pages
  1118. * are initially only visible via the pagetables, and the pte is locked
  1119. * over the call to folio_add_new_anon_rmap.
  1120. */
  1121. VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root,
  1122. folio);
  1123. VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
  1124. page);
  1125. }
  1126. static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped)
  1127. {
  1128. int idx;
  1129. if (nr) {
  1130. idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
  1131. __lruvec_stat_mod_folio(folio, idx, nr);
  1132. }
  1133. if (nr_pmdmapped) {
  1134. if (folio_test_anon(folio)) {
  1135. idx = NR_ANON_THPS;
  1136. __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped);
  1137. } else {
  1138. /* NR_*_PMDMAPPED are not maintained per-memcg */
  1139. idx = folio_test_swapbacked(folio) ?
  1140. NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED;
  1141. __mod_node_page_state(folio_pgdat(folio), idx,
  1142. nr_pmdmapped);
  1143. }
  1144. }
  1145. }
  1146. static __always_inline void __folio_add_anon_rmap(struct folio *folio,
  1147. struct page *page, int nr_pages, struct vm_area_struct *vma,
  1148. unsigned long address, rmap_t flags, enum rmap_level level)
  1149. {
  1150. int i, nr, nr_pmdmapped = 0;
  1151. VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
  1152. nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped);
  1153. if (likely(!folio_test_ksm(folio)))
  1154. __page_check_anon_rmap(folio, page, vma, address);
  1155. __folio_mod_stat(folio, nr, nr_pmdmapped);
  1156. if (flags & RMAP_EXCLUSIVE) {
  1157. switch (level) {
  1158. case RMAP_LEVEL_PTE:
  1159. for (i = 0; i < nr_pages; i++)
  1160. SetPageAnonExclusive(page + i);
  1161. break;
  1162. case RMAP_LEVEL_PMD:
  1163. SetPageAnonExclusive(page);
  1164. break;
  1165. }
  1166. }
  1167. for (i = 0; i < nr_pages; i++) {
  1168. struct page *cur_page = page + i;
  1169. /* While PTE-mapping a THP we have a PMD and a PTE mapping. */
  1170. VM_WARN_ON_FOLIO((atomic_read(&cur_page->_mapcount) > 0 ||
  1171. (folio_test_large(folio) &&
  1172. folio_entire_mapcount(folio) > 1)) &&
  1173. PageAnonExclusive(cur_page), folio);
  1174. }
  1175. /*
  1176. * For a large folio, only mlock it if it's fully mapped to the VMA. It's
  1177. * not easy to check here whether the large folio is fully mapped to the
  1178. * VMA, so only mlock normal 4K folios and leave page reclaim to handle
  1179. * large folios.
  1180. */
  1181. if (!folio_test_large(folio))
  1182. mlock_vma_folio(folio, vma);
  1183. }
  1184. /**
  1185. * folio_add_anon_rmap_ptes - add PTE mappings to a page range of an anon folio
  1186. * @folio: The folio to add the mappings to
  1187. * @page: The first page to add
  1188. * @nr_pages: The number of pages which will be mapped
  1189. * @vma: The vm area in which the mappings are added
  1190. * @address: The user virtual address of the first page to map
  1191. * @flags: The rmap flags
  1192. *
  1193. * The page range of folio is defined by [first_page, first_page + nr_pages)
  1194. *
  1195. * The caller needs to hold the page table lock, and the page must be locked in
  1196. * the anon_vma case: to serialize mapping,index checking after setting,
  1197. * and to ensure that an anon folio is not being upgraded racily to a KSM folio
  1198. * (but KSM folios are never downgraded).
  1199. */
  1200. void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page,
  1201. int nr_pages, struct vm_area_struct *vma, unsigned long address,
  1202. rmap_t flags)
  1203. {
  1204. __folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags,
  1205. RMAP_LEVEL_PTE);
  1206. }
  1207. /**
  1208. * folio_add_anon_rmap_pmd - add a PMD mapping to a page range of an anon folio
  1209. * @folio: The folio to add the mapping to
  1210. * @page: The first page to add
  1211. * @vma: The vm area in which the mapping is added
  1212. * @address: The user virtual address of the first page to map
  1213. * @flags: The rmap flags
  1214. *
  1215. * The page range of folio is defined by [first_page, first_page + HPAGE_PMD_NR)
  1216. *
  1217. * The caller needs to hold the page table lock, and the page must be locked in
  1218. * the anon_vma case: to serialize mapping,index checking after setting.
  1219. */
  1220. void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page,
  1221. struct vm_area_struct *vma, unsigned long address, rmap_t flags)
  1222. {
  1223. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  1224. __folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags,
  1225. RMAP_LEVEL_PMD);
  1226. #else
  1227. WARN_ON_ONCE(true);
  1228. #endif
  1229. }
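/*
 * Illustrative usage sketch (not a call site in this file): a caller that has
 * just installed PTEs or a PMD for a known anon folio, while holding the page
 * table lock and the folio lock, is expected to pair that with the matching
 * rmap call, roughly:
 *
 *	folio_add_anon_rmap_ptes(folio, page, nr, vma, addr, RMAP_NONE);
 *
 * or, for an exclusive PMD mapping of a THP:
 *
 *	folio_add_anon_rmap_pmd(folio, &folio->page, vma, haddr, RMAP_EXCLUSIVE);
 *
 * The real call sites live elsewhere (e.g. mm/memory.c and mm/huge_memory.c);
 * "nr", "addr" and "haddr" above are placeholders for the caller's own values.
 */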
  1230. /**
  1231. * folio_add_new_anon_rmap - Add mapping to a new anonymous folio.
  1232. * @folio: The folio to add the mapping to.
  1233. * @vma: the vm area in which the mapping is added
  1234. * @address: the user virtual address mapped
  1235. * @flags: The rmap flags
  1236. *
  1237. * Like folio_add_anon_rmap_*() but must only be called on *new* folios.
  1238. * This means the inc-and-test can be bypassed.
  1239. * The folio doesn't necessarily need to be locked while it's exclusive
  1240. * unless two threads map it concurrently. However, the folio must be
  1241. * locked if it's shared.
  1242. *
  1243. * If the folio is pmd-mappable, it is accounted as a THP.
  1244. */
  1245. void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
  1246. unsigned long address, rmap_t flags)
  1247. {
  1248. const int nr = folio_nr_pages(folio);
  1249. const bool exclusive = flags & RMAP_EXCLUSIVE;
  1250. int nr_pmdmapped = 0;
  1251. VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
  1252. VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio);
  1253. VM_BUG_ON_VMA(address < vma->vm_start ||
  1254. address + (nr << PAGE_SHIFT) > vma->vm_end, vma);
  1255. /*
  1256. * VM_DROPPABLE mappings don't swap; instead they're just dropped when
  1257. * under memory pressure.
  1258. */
  1259. if (!folio_test_swapbacked(folio) && !(vma->vm_flags & VM_DROPPABLE))
  1260. __folio_set_swapbacked(folio);
  1261. __folio_set_anon(folio, vma, address, exclusive);
  1262. if (likely(!folio_test_large(folio))) {
  1263. /* increment count (starts at -1) */
  1264. atomic_set(&folio->_mapcount, 0);
  1265. if (exclusive)
  1266. SetPageAnonExclusive(&folio->page);
  1267. } else if (!folio_test_pmd_mappable(folio)) {
  1268. int i;
  1269. for (i = 0; i < nr; i++) {
  1270. struct page *page = folio_page(folio, i);
  1271. /* increment count (starts at -1) */
  1272. atomic_set(&page->_mapcount, 0);
  1273. if (exclusive)
  1274. SetPageAnonExclusive(page);
  1275. }
  1276. /* increment count (starts at -1) */
  1277. atomic_set(&folio->_large_mapcount, nr - 1);
  1278. atomic_set(&folio->_nr_pages_mapped, nr);
  1279. } else {
  1280. /* increment count (starts at -1) */
  1281. atomic_set(&folio->_entire_mapcount, 0);
  1282. /* increment count (starts at -1) */
  1283. atomic_set(&folio->_large_mapcount, 0);
  1284. atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
  1285. if (exclusive)
  1286. SetPageAnonExclusive(&folio->page);
  1287. nr_pmdmapped = nr;
  1288. }
  1289. __folio_mod_stat(folio, nr, nr_pmdmapped);
  1290. mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
  1291. }
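/*
 * Illustrative usage sketch (not a call site in this file): in a typical
 * anonymous-fault style caller, the freshly allocated, exclusive folio is
 * wired up under the page table lock roughly as follows (allocation, charging
 * and error handling omitted):
 *
 *	__folio_mark_uptodate(folio);
 *	folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
 *	folio_add_lru_vma(folio, vma);
 *	set_pte_at(vma->vm_mm, addr, pte, mk_pte(&folio->page, vma->vm_page_prot));
 *
 * "addr" and "pte" are placeholders for the faulting address and the mapped
 * PTE pointer; see do_anonymous_page() in mm/memory.c for the real sequence.
 */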
  1292. static __always_inline void __folio_add_file_rmap(struct folio *folio,
  1293. struct page *page, int nr_pages, struct vm_area_struct *vma,
  1294. enum rmap_level level)
  1295. {
  1296. int nr, nr_pmdmapped = 0;
  1297. VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
  1298. nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped);
  1299. __folio_mod_stat(folio, nr, nr_pmdmapped);
  1300. /* See comments in folio_add_anon_rmap_*() */
  1301. if (!folio_test_large(folio))
  1302. mlock_vma_folio(folio, vma);
  1303. }
  1304. /**
  1305. * folio_add_file_rmap_ptes - add PTE mappings to a page range of a folio
  1306. * @folio: The folio to add the mappings to
  1307. * @page: The first page to add
  1308. * @nr_pages: The number of pages that will be mapped using PTEs
  1309. * @vma: The vm area in which the mappings are added
  1310. *
  1311. * The page range of the folio is defined by [page, page + nr_pages)
  1312. *
  1313. * The caller needs to hold the page table lock.
  1314. */
  1315. void folio_add_file_rmap_ptes(struct folio *folio, struct page *page,
  1316. int nr_pages, struct vm_area_struct *vma)
  1317. {
  1318. __folio_add_file_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE);
  1319. }
  1320. /**
  1321. * folio_add_file_rmap_pmd - add a PMD mapping to a page range of a folio
  1322. * @folio: The folio to add the mapping to
  1323. * @page: The first page to add
  1324. * @vma: The vm area in which the mapping is added
  1325. *
  1326. * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
  1327. *
  1328. * The caller needs to hold the page table lock.
  1329. */
  1330. void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
  1331. struct vm_area_struct *vma)
  1332. {
  1333. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  1334. __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD);
  1335. #else
  1336. WARN_ON_ONCE(true);
  1337. #endif
  1338. }
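/*
 * Illustrative usage sketch (not a call site in this file): when a fault
 * handler maps a single page of a page-cache folio with a PTE, while holding
 * the page table lock, the pairing is roughly:
 *
 *	folio_add_file_rmap_ptes(folio, page, 1, vma);
 *	set_pte_at(vma->vm_mm, addr, pte, entry);
 *
 * (or the folio_add_file_rmap_pte() single-page wrapper); for a PMD-mapped
 * file THP the analogous call is folio_add_file_rmap_pmd(). "addr", "pte" and
 * "entry" are placeholders for the caller's own values.
 */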
  1339. static __always_inline void __folio_remove_rmap(struct folio *folio,
  1340. struct page *page, int nr_pages, struct vm_area_struct *vma,
  1341. enum rmap_level level)
  1342. {
  1343. atomic_t *mapped = &folio->_nr_pages_mapped;
  1344. int last = 0, nr = 0, nr_pmdmapped = 0;
  1345. bool partially_mapped = false;
  1346. __folio_rmap_sanity_checks(folio, page, nr_pages, level);
  1347. switch (level) {
  1348. case RMAP_LEVEL_PTE:
  1349. if (!folio_test_large(folio)) {
  1350. nr = atomic_add_negative(-1, &folio->_mapcount);
  1351. break;
  1352. }
  1353. atomic_sub(nr_pages, &folio->_large_mapcount);
  1354. do {
  1355. last += atomic_add_negative(-1, &page->_mapcount);
  1356. } while (page++, --nr_pages > 0);
  1357. if (last &&
  1358. atomic_sub_return_relaxed(last, mapped) < ENTIRELY_MAPPED)
  1359. nr = last;
  1360. partially_mapped = nr && atomic_read(mapped);
  1361. break;
  1362. case RMAP_LEVEL_PMD:
  1363. atomic_dec(&folio->_large_mapcount);
  1364. last = atomic_add_negative(-1, &folio->_entire_mapcount);
  1365. if (last) {
  1366. nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped);
  1367. if (likely(nr < ENTIRELY_MAPPED)) {
  1368. nr_pmdmapped = folio_nr_pages(folio);
  1369. nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
  1370. /* Raced ahead of another remove and an add? */
  1371. if (unlikely(nr < 0))
  1372. nr = 0;
  1373. } else {
  1374. /* An add of ENTIRELY_MAPPED raced ahead */
  1375. nr = 0;
  1376. }
  1377. }
  1378. partially_mapped = nr && nr < nr_pmdmapped;
  1379. break;
  1380. }
  1381. /*
  1382. * Queue anon large folio for deferred split if at least one page of
  1383. * the folio is unmapped and at least one page is still mapped.
  1384. *
  1385. * Check partially_mapped first to ensure it is a large folio.
  1386. */
  1387. if (partially_mapped && folio_test_anon(folio) &&
  1388. !folio_test_partially_mapped(folio))
  1389. deferred_split_folio(folio, true);
  1390. __folio_mod_stat(folio, -nr, -nr_pmdmapped);
  1391. /*
  1392. * It would be tidy to reset folio_test_anon mapping when fully
  1393. * unmapped, but that might overwrite a racing folio_add_anon_rmap_*()
  1394. * which increments mapcount after us but sets mapping before us:
  1395. * so leave the reset to free_pages_prepare, and remember that
  1396. * it's only reliable while mapped.
  1397. */
  1398. munlock_vma_folio(folio, vma);
  1399. }
  1400. /**
  1401. * folio_remove_rmap_ptes - remove PTE mappings from a page range of a folio
  1402. * @folio: The folio to remove the mappings from
  1403. * @page: The first page to remove
  1404. * @nr_pages: The number of pages that will be removed from the mapping
  1405. * @vma: The vm area from which the mappings are removed
  1406. *
  1407. * The page range of the folio is defined by [page, page + nr_pages)
  1408. *
  1409. * The caller needs to hold the page table lock.
  1410. */
  1411. void folio_remove_rmap_ptes(struct folio *folio, struct page *page,
  1412. int nr_pages, struct vm_area_struct *vma)
  1413. {
  1414. __folio_remove_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE);
  1415. }
  1416. /**
  1417. * folio_remove_rmap_pmd - remove a PMD mapping from a page range of a folio
  1418. * @folio: The folio to remove the mapping from
  1419. * @page: The first page to remove
  1420. * @vma: The vm area from which the mapping is removed
  1421. *
  1422. * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
  1423. *
  1424. * The caller needs to hold the page table lock.
  1425. */
  1426. void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
  1427. struct vm_area_struct *vma)
  1428. {
  1429. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  1430. __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD);
  1431. #else
  1432. WARN_ON_ONCE(true);
  1433. #endif
  1434. }
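/*
 * Illustrative usage sketch (not a call site in this file): when a caller
 * clears a PTE that mapped one page of a folio (e.g. during a zap), still
 * under the page table lock, the mapping is torn down roughly as:
 *
 *	pteval = ptep_get_and_clear(mm, addr, pte);
 *	folio_remove_rmap_pte(folio, page, vma);
 *	... TLB flush ...
 *	folio_put(folio);
 *
 * folio_remove_rmap_pte() is the single-page wrapper around
 * folio_remove_rmap_ptes(); the real zap paths in mm/memory.c also handle
 * dirty/accessed propagation and batching, which are omitted here.
 */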
/*
 * @arg: enum ttu_flags is passed in this argument
 */
  1438. static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
  1439. unsigned long address, void *arg)
  1440. {
  1441. struct mm_struct *mm = vma->vm_mm;
  1442. DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
  1443. pte_t pteval;
  1444. struct page *subpage;
  1445. bool anon_exclusive, ret = true;
  1446. struct mmu_notifier_range range;
  1447. enum ttu_flags flags = (enum ttu_flags)(long)arg;
  1448. unsigned long pfn;
  1449. unsigned long hsz = 0;
  1450. /*
  1451. * When racing against e.g. zap_pte_range() on another cpu,
  1452. * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
  1453. * try_to_unmap() may return before page_mapped() has become false,
  1454. * if page table locking is skipped: use TTU_SYNC to wait for that.
  1455. */
  1456. if (flags & TTU_SYNC)
  1457. pvmw.flags = PVMW_SYNC;
	/*
	 * For THP, we have to assume the worst case, i.e. PMD granularity,
	 * for invalidation. For hugetlb, it could be much worse if we need
	 * to do PUD invalidation in the case of PMD sharing.
	 *
	 * Note that the folio cannot be freed in this function, as the
	 * caller of try_to_unmap() must hold a reference on the folio.
	 */
  1466. range.end = vma_address_end(&pvmw);
  1467. mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
  1468. address, range.end);
  1469. if (folio_test_hugetlb(folio)) {
  1470. /*
  1471. * If sharing is possible, start and end will be adjusted
  1472. * accordingly.
  1473. */
  1474. adjust_range_if_pmd_sharing_possible(vma, &range.start,
  1475. &range.end);
  1476. /* We need the huge page size for set_huge_pte_at() */
  1477. hsz = huge_page_size(hstate_vma(vma));
  1478. }
  1479. mmu_notifier_invalidate_range_start(&range);
  1480. while (page_vma_mapped_walk(&pvmw)) {
  1481. /*
  1482. * If the folio is in an mlock()d vma, we must not swap it out.
  1483. */
  1484. if (!(flags & TTU_IGNORE_MLOCK) &&
  1485. (vma->vm_flags & VM_LOCKED)) {
  1486. /* Restore the mlock which got missed */
  1487. if (!folio_test_large(folio))
  1488. mlock_vma_folio(folio, vma);
  1489. goto walk_abort;
  1490. }
  1491. if (!pvmw.pte) {
  1492. if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd,
  1493. folio))
  1494. goto walk_done;
  1495. if (flags & TTU_SPLIT_HUGE_PMD) {
  1496. /*
  1497. * We temporarily have to drop the PTL and
  1498. * restart so we can process the PTE-mapped THP.
  1499. */
  1500. split_huge_pmd_locked(vma, pvmw.address,
  1501. pvmw.pmd, false, folio);
  1502. flags &= ~TTU_SPLIT_HUGE_PMD;
  1503. page_vma_mapped_walk_restart(&pvmw);
  1504. continue;
  1505. }
  1506. }
  1507. /* Unexpected PMD-mapped THP? */
  1508. VM_BUG_ON_FOLIO(!pvmw.pte, folio);
  1509. pfn = pte_pfn(ptep_get(pvmw.pte));
  1510. subpage = folio_page(folio, pfn - folio_pfn(folio));
  1511. address = pvmw.address;
  1512. anon_exclusive = folio_test_anon(folio) &&
  1513. PageAnonExclusive(subpage);
  1514. if (folio_test_hugetlb(folio)) {
  1515. bool anon = folio_test_anon(folio);
			/*
			 * try_to_unmap() is only passed a hugetlb page
			 * when that page is poisoned.
			 */
  1520. VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage);
  1521. /*
  1522. * huge_pmd_unshare may unmap an entire PMD page.
  1523. * There is no way of knowing exactly which PMDs may
  1524. * be cached for this mm, so we must flush them all.
  1525. * start/end were already adjusted above to cover this
  1526. * range.
  1527. */
  1528. flush_cache_range(vma, range.start, range.end);
  1529. /*
  1530. * To call huge_pmd_unshare, i_mmap_rwsem must be
  1531. * held in write mode. Caller needs to explicitly
  1532. * do this outside rmap routines.
  1533. *
  1534. * We also must hold hugetlb vma_lock in write mode.
  1535. * Lock order dictates acquiring vma_lock BEFORE
  1536. * i_mmap_rwsem. We can only try lock here and fail
  1537. * if unsuccessful.
  1538. */
  1539. if (!anon) {
  1540. VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
  1541. if (!hugetlb_vma_trylock_write(vma))
  1542. goto walk_abort;
  1543. if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
  1544. hugetlb_vma_unlock_write(vma);
  1545. flush_tlb_range(vma,
  1546. range.start, range.end);
  1547. /*
  1548. * The ref count of the PMD page was
  1549. * dropped which is part of the way map
  1550. * counting is done for shared PMDs.
  1551. * Return 'true' here. When there is
  1552. * no other sharing, huge_pmd_unshare
  1553. * returns false and we will unmap the
  1554. * actual page and drop map count
  1555. * to zero.
  1556. */
  1557. goto walk_done;
  1558. }
  1559. hugetlb_vma_unlock_write(vma);
  1560. }
  1561. pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
  1562. } else {
  1563. flush_cache_page(vma, address, pfn);
  1564. /* Nuke the page table entry. */
  1565. if (should_defer_flush(mm, flags)) {
  1566. /*
  1567. * We clear the PTE but do not flush so potentially
  1568. * a remote CPU could still be writing to the folio.
  1569. * If the entry was previously clean then the
  1570. * architecture must guarantee that a clear->dirty
  1571. * transition on a cached TLB entry is written through
  1572. * and traps if the PTE is unmapped.
  1573. */
  1574. pteval = ptep_get_and_clear(mm, address, pvmw.pte);
  1575. set_tlb_ubc_flush_pending(mm, pteval, address);
  1576. } else {
  1577. pteval = ptep_clear_flush(vma, address, pvmw.pte);
  1578. }
  1579. }
  1580. /*
  1581. * Now the pte is cleared. If this pte was uffd-wp armed,
  1582. * we may want to replace a none pte with a marker pte if
  1583. * it's file-backed, so we don't lose the tracking info.
  1584. */
  1585. pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
  1586. /* Set the dirty flag on the folio now the pte is gone. */
  1587. if (pte_dirty(pteval))
  1588. folio_mark_dirty(folio);
  1589. /* Update high watermark before we lower rss */
  1590. update_hiwater_rss(mm);
  1591. if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) {
  1592. pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
  1593. if (folio_test_hugetlb(folio)) {
  1594. hugetlb_count_sub(folio_nr_pages(folio), mm);
  1595. set_huge_pte_at(mm, address, pvmw.pte, pteval,
  1596. hsz);
  1597. } else {
  1598. dec_mm_counter(mm, mm_counter(folio));
  1599. set_pte_at(mm, address, pvmw.pte, pteval);
  1600. }
  1601. } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
  1602. /*
  1603. * The guest indicated that the page content is of no
  1604. * interest anymore. Simply discard the pte, vmscan
  1605. * will take care of the rest.
  1606. * A future reference will then fault in a new zero
  1607. * page. When userfaultfd is active, we must not drop
  1608. * this page though, as its main user (postcopy
  1609. * migration) will not expect userfaults on already
  1610. * copied pages.
  1611. */
  1612. dec_mm_counter(mm, mm_counter(folio));
  1613. } else if (folio_test_anon(folio)) {
  1614. swp_entry_t entry = page_swap_entry(subpage);
  1615. pte_t swp_pte;
  1616. /*
  1617. * Store the swap location in the pte.
  1618. * See handle_pte_fault() ...
  1619. */
  1620. if (unlikely(folio_test_swapbacked(folio) !=
  1621. folio_test_swapcache(folio))) {
  1622. WARN_ON_ONCE(1);
  1623. goto walk_abort;
  1624. }
  1625. /* MADV_FREE page check */
  1626. if (!folio_test_swapbacked(folio)) {
  1627. int ref_count, map_count;
  1628. /*
  1629. * Synchronize with gup_pte_range():
  1630. * - clear PTE; barrier; read refcount
  1631. * - inc refcount; barrier; read PTE
  1632. */
  1633. smp_mb();
  1634. ref_count = folio_ref_count(folio);
  1635. map_count = folio_mapcount(folio);
  1636. /*
  1637. * Order reads for page refcount and dirty flag
  1638. * (see comments in __remove_mapping()).
  1639. */
  1640. smp_rmb();
  1641. /*
  1642. * The only page refs must be one from isolation
  1643. * plus the rmap(s) (dropped by discard:).
  1644. */
  1645. if (ref_count == 1 + map_count &&
  1646. (!folio_test_dirty(folio) ||
  1647. /*
  1648. * Unlike MADV_FREE mappings, VM_DROPPABLE
  1649. * ones can be dropped even if they've
  1650. * been dirtied.
  1651. */
  1652. (vma->vm_flags & VM_DROPPABLE))) {
  1653. dec_mm_counter(mm, MM_ANONPAGES);
  1654. goto discard;
  1655. }
  1656. /*
  1657. * If the folio was redirtied, it cannot be
  1658. * discarded. Remap the page to page table.
  1659. */
  1660. set_pte_at(mm, address, pvmw.pte, pteval);
  1661. /*
  1662. * Unlike MADV_FREE mappings, VM_DROPPABLE ones
  1663. * never get swap backed on failure to drop.
  1664. */
  1665. if (!(vma->vm_flags & VM_DROPPABLE))
  1666. folio_set_swapbacked(folio);
  1667. goto walk_abort;
  1668. }
  1669. if (swap_duplicate(entry) < 0) {
  1670. set_pte_at(mm, address, pvmw.pte, pteval);
  1671. goto walk_abort;
  1672. }
  1673. if (arch_unmap_one(mm, vma, address, pteval) < 0) {
  1674. swap_free(entry);
  1675. set_pte_at(mm, address, pvmw.pte, pteval);
  1676. goto walk_abort;
  1677. }
  1678. /* See folio_try_share_anon_rmap(): clear PTE first. */
  1679. if (anon_exclusive &&
  1680. folio_try_share_anon_rmap_pte(folio, subpage)) {
  1681. swap_free(entry);
  1682. set_pte_at(mm, address, pvmw.pte, pteval);
  1683. goto walk_abort;
  1684. }
  1685. if (list_empty(&mm->mmlist)) {
  1686. spin_lock(&mmlist_lock);
  1687. if (list_empty(&mm->mmlist))
  1688. list_add(&mm->mmlist, &init_mm.mmlist);
  1689. spin_unlock(&mmlist_lock);
  1690. }
  1691. dec_mm_counter(mm, MM_ANONPAGES);
  1692. inc_mm_counter(mm, MM_SWAPENTS);
  1693. swp_pte = swp_entry_to_pte(entry);
  1694. if (anon_exclusive)
  1695. swp_pte = pte_swp_mkexclusive(swp_pte);
  1696. if (pte_soft_dirty(pteval))
  1697. swp_pte = pte_swp_mksoft_dirty(swp_pte);
  1698. if (pte_uffd_wp(pteval))
  1699. swp_pte = pte_swp_mkuffd_wp(swp_pte);
  1700. set_pte_at(mm, address, pvmw.pte, swp_pte);
  1701. } else {
  1702. /*
  1703. * This is a locked file-backed folio,
  1704. * so it cannot be removed from the page
  1705. * cache and replaced by a new folio before
  1706. * mmu_notifier_invalidate_range_end, so no
  1707. * concurrent thread might update its page table
  1708. * to point at a new folio while a device is
  1709. * still using this folio.
  1710. *
  1711. * See Documentation/mm/mmu_notifier.rst
  1712. */
  1713. dec_mm_counter(mm, mm_counter_file(folio));
  1714. }
  1715. discard:
  1716. if (unlikely(folio_test_hugetlb(folio)))
  1717. hugetlb_remove_rmap(folio);
  1718. else
  1719. folio_remove_rmap_pte(folio, subpage, vma);
  1720. if (vma->vm_flags & VM_LOCKED)
  1721. mlock_drain_local();
  1722. folio_put(folio);
  1723. continue;
  1724. walk_abort:
  1725. ret = false;
  1726. walk_done:
  1727. page_vma_mapped_walk_done(&pvmw);
  1728. break;
  1729. }
  1730. mmu_notifier_invalidate_range_end(&range);
  1731. return ret;
  1732. }
  1733. static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
  1734. {
  1735. return vma_is_temporary_stack(vma);
  1736. }
  1737. static int folio_not_mapped(struct folio *folio)
  1738. {
  1739. return !folio_mapped(folio);
  1740. }
  1741. /**
  1742. * try_to_unmap - Try to remove all page table mappings to a folio.
  1743. * @folio: The folio to unmap.
  1744. * @flags: action and flags
  1745. *
  1746. * Tries to remove all the page table entries which are mapping this
  1747. * folio. It is the caller's responsibility to check if the folio is
  1748. * still mapped if needed (use TTU_SYNC to prevent accounting races).
  1749. *
  1750. * Context: Caller must hold the folio lock.
  1751. */
  1752. void try_to_unmap(struct folio *folio, enum ttu_flags flags)
  1753. {
  1754. struct rmap_walk_control rwc = {
  1755. .rmap_one = try_to_unmap_one,
  1756. .arg = (void *)flags,
  1757. .done = folio_not_mapped,
  1758. .anon_lock = folio_lock_anon_vma_read,
  1759. };
  1760. if (flags & TTU_RMAP_LOCKED)
  1761. rmap_walk_locked(folio, &rwc);
  1762. else
  1763. rmap_walk(folio, &rwc);
  1764. }
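/*
 * Illustrative usage sketch (not a call site in this file): page reclaim
 * unmaps a locked, isolated folio roughly like this before trying to free it
 * (see shrink_folio_list() in mm/vmscan.c for the real flag selection):
 *
 *	if (folio_mapped(folio)) {
 *		try_to_unmap(folio, TTU_BATCH_FLUSH);
 *		if (folio_mapped(folio))
 *			goto activate_or_keep;	// placeholder label
 *	}
 */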
/*
 * @arg: enum ttu_flags is passed in this argument.
 *
 * If TTU_SPLIT_HUGE_PMD is specified, any PMD mappings will be split into PTEs
 * containing migration entries.
 */
  1771. static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
  1772. unsigned long address, void *arg)
  1773. {
  1774. struct mm_struct *mm = vma->vm_mm;
  1775. DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
  1776. pte_t pteval;
  1777. struct page *subpage;
  1778. bool anon_exclusive, ret = true;
  1779. struct mmu_notifier_range range;
  1780. enum ttu_flags flags = (enum ttu_flags)(long)arg;
  1781. unsigned long pfn;
  1782. unsigned long hsz = 0;
  1783. /*
  1784. * When racing against e.g. zap_pte_range() on another cpu,
  1785. * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
  1786. * try_to_migrate() may return before page_mapped() has become false,
  1787. * if page table locking is skipped: use TTU_SYNC to wait for that.
  1788. */
  1789. if (flags & TTU_SYNC)
  1790. pvmw.flags = PVMW_SYNC;
  1791. /*
  1792. * unmap_page() in mm/huge_memory.c is the only user of migration with
  1793. * TTU_SPLIT_HUGE_PMD and it wants to freeze.
  1794. */
  1795. if (flags & TTU_SPLIT_HUGE_PMD)
  1796. split_huge_pmd_address(vma, address, true, folio);
	/*
	 * For THP, we have to assume the worst case, i.e. PMD granularity,
	 * for invalidation. For hugetlb, it could be much worse if we need
	 * to do PUD invalidation in the case of PMD sharing.
	 *
	 * Note that the folio cannot be freed in this function, as the
	 * caller of try_to_migrate() must hold a reference on it.
	 */
  1805. range.end = vma_address_end(&pvmw);
  1806. mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
  1807. address, range.end);
  1808. if (folio_test_hugetlb(folio)) {
  1809. /*
  1810. * If sharing is possible, start and end will be adjusted
  1811. * accordingly.
  1812. */
  1813. adjust_range_if_pmd_sharing_possible(vma, &range.start,
  1814. &range.end);
  1815. /* We need the huge page size for set_huge_pte_at() */
  1816. hsz = huge_page_size(hstate_vma(vma));
  1817. }
  1818. mmu_notifier_invalidate_range_start(&range);
  1819. while (page_vma_mapped_walk(&pvmw)) {
  1820. #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
  1821. /* PMD-mapped THP migration entry */
  1822. if (!pvmw.pte) {
  1823. subpage = folio_page(folio,
  1824. pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
  1825. VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
  1826. !folio_test_pmd_mappable(folio), folio);
  1827. if (set_pmd_migration_entry(&pvmw, subpage)) {
  1828. ret = false;
  1829. page_vma_mapped_walk_done(&pvmw);
  1830. break;
  1831. }
  1832. continue;
  1833. }
  1834. #endif
  1835. /* Unexpected PMD-mapped THP? */
  1836. VM_BUG_ON_FOLIO(!pvmw.pte, folio);
  1837. pfn = pte_pfn(ptep_get(pvmw.pte));
  1838. if (folio_is_zone_device(folio)) {
  1839. /*
  1840. * Our PTE is a non-present device exclusive entry and
  1841. * calculating the subpage as for the common case would
  1842. * result in an invalid pointer.
  1843. *
  1844. * Since only PAGE_SIZE pages can currently be
  1845. * migrated, just set it to page. This will need to be
  1846. * changed when hugepage migrations to device private
  1847. * memory are supported.
  1848. */
  1849. VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio);
  1850. subpage = &folio->page;
  1851. } else {
  1852. subpage = folio_page(folio, pfn - folio_pfn(folio));
  1853. }
  1854. address = pvmw.address;
  1855. anon_exclusive = folio_test_anon(folio) &&
  1856. PageAnonExclusive(subpage);
  1857. if (folio_test_hugetlb(folio)) {
  1858. bool anon = folio_test_anon(folio);
  1859. /*
  1860. * huge_pmd_unshare may unmap an entire PMD page.
  1861. * There is no way of knowing exactly which PMDs may
  1862. * be cached for this mm, so we must flush them all.
  1863. * start/end were already adjusted above to cover this
  1864. * range.
  1865. */
  1866. flush_cache_range(vma, range.start, range.end);
  1867. /*
  1868. * To call huge_pmd_unshare, i_mmap_rwsem must be
  1869. * held in write mode. Caller needs to explicitly
  1870. * do this outside rmap routines.
  1871. *
  1872. * We also must hold hugetlb vma_lock in write mode.
  1873. * Lock order dictates acquiring vma_lock BEFORE
  1874. * i_mmap_rwsem. We can only try lock here and
  1875. * fail if unsuccessful.
  1876. */
  1877. if (!anon) {
  1878. VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
  1879. if (!hugetlb_vma_trylock_write(vma)) {
  1880. page_vma_mapped_walk_done(&pvmw);
  1881. ret = false;
  1882. break;
  1883. }
  1884. if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
  1885. hugetlb_vma_unlock_write(vma);
  1886. flush_tlb_range(vma,
  1887. range.start, range.end);
  1888. /*
  1889. * The ref count of the PMD page was
  1890. * dropped which is part of the way map
  1891. * counting is done for shared PMDs.
  1892. * Return 'true' here. When there is
  1893. * no other sharing, huge_pmd_unshare
  1894. * returns false and we will unmap the
  1895. * actual page and drop map count
  1896. * to zero.
  1897. */
  1898. page_vma_mapped_walk_done(&pvmw);
  1899. break;
  1900. }
  1901. hugetlb_vma_unlock_write(vma);
  1902. }
  1903. /* Nuke the hugetlb page table entry */
  1904. pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
  1905. } else {
  1906. flush_cache_page(vma, address, pfn);
  1907. /* Nuke the page table entry. */
  1908. if (should_defer_flush(mm, flags)) {
  1909. /*
  1910. * We clear the PTE but do not flush so potentially
  1911. * a remote CPU could still be writing to the folio.
  1912. * If the entry was previously clean then the
  1913. * architecture must guarantee that a clear->dirty
  1914. * transition on a cached TLB entry is written through
  1915. * and traps if the PTE is unmapped.
  1916. */
  1917. pteval = ptep_get_and_clear(mm, address, pvmw.pte);
  1918. set_tlb_ubc_flush_pending(mm, pteval, address);
  1919. } else {
  1920. pteval = ptep_clear_flush(vma, address, pvmw.pte);
  1921. }
  1922. }
  1923. /* Set the dirty flag on the folio now the pte is gone. */
  1924. if (pte_dirty(pteval))
  1925. folio_mark_dirty(folio);
  1926. /* Update high watermark before we lower rss */
  1927. update_hiwater_rss(mm);
  1928. if (folio_is_device_private(folio)) {
  1929. unsigned long pfn = folio_pfn(folio);
  1930. swp_entry_t entry;
  1931. pte_t swp_pte;
  1932. if (anon_exclusive)
  1933. WARN_ON_ONCE(folio_try_share_anon_rmap_pte(folio,
  1934. subpage));
  1935. /*
  1936. * Store the pfn of the page in a special migration
  1937. * pte. do_swap_page() will wait until the migration
  1938. * pte is removed and then restart fault handling.
  1939. */
  1940. entry = pte_to_swp_entry(pteval);
  1941. if (is_writable_device_private_entry(entry))
  1942. entry = make_writable_migration_entry(pfn);
  1943. else if (anon_exclusive)
  1944. entry = make_readable_exclusive_migration_entry(pfn);
  1945. else
  1946. entry = make_readable_migration_entry(pfn);
  1947. swp_pte = swp_entry_to_pte(entry);
  1948. /*
  1949. * pteval maps a zone device page and is therefore
  1950. * a swap pte.
  1951. */
  1952. if (pte_swp_soft_dirty(pteval))
  1953. swp_pte = pte_swp_mksoft_dirty(swp_pte);
  1954. if (pte_swp_uffd_wp(pteval))
  1955. swp_pte = pte_swp_mkuffd_wp(swp_pte);
  1956. set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
  1957. trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
  1958. folio_order(folio));
			/*
			 * No need to invalidate here; it will synchronize
			 * against the special swap migration pte.
			 */
  1963. } else if (PageHWPoison(subpage)) {
  1964. pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
  1965. if (folio_test_hugetlb(folio)) {
  1966. hugetlb_count_sub(folio_nr_pages(folio), mm);
  1967. set_huge_pte_at(mm, address, pvmw.pte, pteval,
  1968. hsz);
  1969. } else {
  1970. dec_mm_counter(mm, mm_counter(folio));
  1971. set_pte_at(mm, address, pvmw.pte, pteval);
  1972. }
  1973. } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
  1974. /*
  1975. * The guest indicated that the page content is of no
  1976. * interest anymore. Simply discard the pte, vmscan
  1977. * will take care of the rest.
  1978. * A future reference will then fault in a new zero
  1979. * page. When userfaultfd is active, we must not drop
  1980. * this page though, as its main user (postcopy
  1981. * migration) will not expect userfaults on already
  1982. * copied pages.
  1983. */
  1984. dec_mm_counter(mm, mm_counter(folio));
  1985. } else {
  1986. swp_entry_t entry;
  1987. pte_t swp_pte;
  1988. if (arch_unmap_one(mm, vma, address, pteval) < 0) {
  1989. if (folio_test_hugetlb(folio))
  1990. set_huge_pte_at(mm, address, pvmw.pte,
  1991. pteval, hsz);
  1992. else
  1993. set_pte_at(mm, address, pvmw.pte, pteval);
  1994. ret = false;
  1995. page_vma_mapped_walk_done(&pvmw);
  1996. break;
  1997. }
  1998. VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
  1999. !anon_exclusive, subpage);
  2000. /* See folio_try_share_anon_rmap_pte(): clear PTE first. */
  2001. if (folio_test_hugetlb(folio)) {
  2002. if (anon_exclusive &&
  2003. hugetlb_try_share_anon_rmap(folio)) {
  2004. set_huge_pte_at(mm, address, pvmw.pte,
  2005. pteval, hsz);
  2006. ret = false;
  2007. page_vma_mapped_walk_done(&pvmw);
  2008. break;
  2009. }
  2010. } else if (anon_exclusive &&
  2011. folio_try_share_anon_rmap_pte(folio, subpage)) {
  2012. set_pte_at(mm, address, pvmw.pte, pteval);
  2013. ret = false;
  2014. page_vma_mapped_walk_done(&pvmw);
  2015. break;
  2016. }
  2017. /*
  2018. * Store the pfn of the page in a special migration
  2019. * pte. do_swap_page() will wait until the migration
  2020. * pte is removed and then restart fault handling.
  2021. */
  2022. if (pte_write(pteval))
  2023. entry = make_writable_migration_entry(
  2024. page_to_pfn(subpage));
  2025. else if (anon_exclusive)
  2026. entry = make_readable_exclusive_migration_entry(
  2027. page_to_pfn(subpage));
  2028. else
  2029. entry = make_readable_migration_entry(
  2030. page_to_pfn(subpage));
  2031. if (pte_young(pteval))
  2032. entry = make_migration_entry_young(entry);
  2033. if (pte_dirty(pteval))
  2034. entry = make_migration_entry_dirty(entry);
  2035. swp_pte = swp_entry_to_pte(entry);
  2036. if (pte_soft_dirty(pteval))
  2037. swp_pte = pte_swp_mksoft_dirty(swp_pte);
  2038. if (pte_uffd_wp(pteval))
  2039. swp_pte = pte_swp_mkuffd_wp(swp_pte);
  2040. if (folio_test_hugetlb(folio))
  2041. set_huge_pte_at(mm, address, pvmw.pte, swp_pte,
  2042. hsz);
  2043. else
  2044. set_pte_at(mm, address, pvmw.pte, swp_pte);
  2045. trace_set_migration_pte(address, pte_val(swp_pte),
  2046. folio_order(folio));
			/*
			 * No need to invalidate here; it will synchronize
			 * against the special swap migration pte.
			 */
  2051. }
  2052. if (unlikely(folio_test_hugetlb(folio)))
  2053. hugetlb_remove_rmap(folio);
  2054. else
  2055. folio_remove_rmap_pte(folio, subpage, vma);
  2056. if (vma->vm_flags & VM_LOCKED)
  2057. mlock_drain_local();
  2058. folio_put(folio);
  2059. }
  2060. mmu_notifier_invalidate_range_end(&range);
  2061. return ret;
  2062. }
  2063. /**
  2064. * try_to_migrate - try to replace all page table mappings with swap entries
  2065. * @folio: the folio to replace page table entries for
  2066. * @flags: action and flags
  2067. *
  2068. * Tries to remove all the page table entries which are mapping this folio and
  2069. * replace them with special swap entries. Caller must hold the folio lock.
  2070. */
  2071. void try_to_migrate(struct folio *folio, enum ttu_flags flags)
  2072. {
  2073. struct rmap_walk_control rwc = {
  2074. .rmap_one = try_to_migrate_one,
  2075. .arg = (void *)flags,
  2076. .done = folio_not_mapped,
  2077. .anon_lock = folio_lock_anon_vma_read,
  2078. };
	/*
	 * Migration always ignores mlock and only supports the
	 * TTU_RMAP_LOCKED, TTU_SPLIT_HUGE_PMD, TTU_SYNC, and TTU_BATCH_FLUSH
	 * flags.
	 */
  2083. if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
  2084. TTU_SYNC | TTU_BATCH_FLUSH)))
  2085. return;
  2086. if (folio_is_zone_device(folio) &&
  2087. (!folio_is_device_private(folio) && !folio_is_device_coherent(folio)))
  2088. return;
	/*
	 * During exec, a temporary VMA is set up and later moved.
	 * The VMA is moved under the anon_vma lock but not the
	 * page tables, leading to a race where migration cannot
	 * find the migration ptes. Rather than increasing the
	 * locking requirements of exec(), migration skips
	 * temporary VMAs until after exec() completes.
	 */
  2097. if (!folio_test_ksm(folio) && folio_test_anon(folio))
  2098. rwc.invalid_vma = invalid_migration_vma;
  2099. if (flags & TTU_RMAP_LOCKED)
  2100. rmap_walk_locked(folio, &rwc);
  2101. else
  2102. rmap_walk(folio, &rwc);
  2103. }
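/*
 * Illustrative usage sketch (not a call site in this file): migration first
 * replaces all mappings of the locked source folio with migration entries and
 * only proceeds once nothing maps it anymore, roughly:
 *
 *	try_to_migrate(src, 0);
 *	if (!folio_mapped(src))
 *		... copy src to dst and later remove_migration_ptes() ...
 *
 * See migrate_pages() and its helpers in mm/migrate.c for the real flag
 * handling and retry logic.
 */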
  2104. #ifdef CONFIG_DEVICE_PRIVATE
  2105. struct make_exclusive_args {
  2106. struct mm_struct *mm;
  2107. unsigned long address;
  2108. void *owner;
  2109. bool valid;
  2110. };
  2111. static bool page_make_device_exclusive_one(struct folio *folio,
  2112. struct vm_area_struct *vma, unsigned long address, void *priv)
  2113. {
  2114. struct mm_struct *mm = vma->vm_mm;
  2115. DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
  2116. struct make_exclusive_args *args = priv;
  2117. pte_t pteval;
  2118. struct page *subpage;
  2119. bool ret = true;
  2120. struct mmu_notifier_range range;
  2121. swp_entry_t entry;
  2122. pte_t swp_pte;
  2123. pte_t ptent;
  2124. mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
  2125. vma->vm_mm, address, min(vma->vm_end,
  2126. address + folio_size(folio)),
  2127. args->owner);
  2128. mmu_notifier_invalidate_range_start(&range);
  2129. while (page_vma_mapped_walk(&pvmw)) {
  2130. /* Unexpected PMD-mapped THP? */
  2131. VM_BUG_ON_FOLIO(!pvmw.pte, folio);
  2132. ptent = ptep_get(pvmw.pte);
  2133. if (!pte_present(ptent)) {
  2134. ret = false;
  2135. page_vma_mapped_walk_done(&pvmw);
  2136. break;
  2137. }
  2138. subpage = folio_page(folio,
  2139. pte_pfn(ptent) - folio_pfn(folio));
  2140. address = pvmw.address;
  2141. /* Nuke the page table entry. */
  2142. flush_cache_page(vma, address, pte_pfn(ptent));
  2143. pteval = ptep_clear_flush(vma, address, pvmw.pte);
  2144. /* Set the dirty flag on the folio now the pte is gone. */
  2145. if (pte_dirty(pteval))
  2146. folio_mark_dirty(folio);
  2147. /*
  2148. * Check that our target page is still mapped at the expected
  2149. * address.
  2150. */
  2151. if (args->mm == mm && args->address == address &&
  2152. pte_write(pteval))
  2153. args->valid = true;
  2154. /*
  2155. * Store the pfn of the page in a special migration
  2156. * pte. do_swap_page() will wait until the migration
  2157. * pte is removed and then restart fault handling.
  2158. */
  2159. if (pte_write(pteval))
  2160. entry = make_writable_device_exclusive_entry(
  2161. page_to_pfn(subpage));
  2162. else
  2163. entry = make_readable_device_exclusive_entry(
  2164. page_to_pfn(subpage));
  2165. swp_pte = swp_entry_to_pte(entry);
  2166. if (pte_soft_dirty(pteval))
  2167. swp_pte = pte_swp_mksoft_dirty(swp_pte);
  2168. if (pte_uffd_wp(pteval))
  2169. swp_pte = pte_swp_mkuffd_wp(swp_pte);
  2170. set_pte_at(mm, address, pvmw.pte, swp_pte);
  2171. /*
  2172. * There is a reference on the page for the swap entry which has
  2173. * been removed, so shouldn't take another.
  2174. */
  2175. folio_remove_rmap_pte(folio, subpage, vma);
  2176. }
  2177. mmu_notifier_invalidate_range_end(&range);
  2178. return ret;
  2179. }
  2180. /**
  2181. * folio_make_device_exclusive - Mark the folio exclusively owned by a device.
  2182. * @folio: The folio to replace page table entries for.
  2183. * @mm: The mm_struct where the folio is expected to be mapped.
  2184. * @address: Address where the folio is expected to be mapped.
  2185. * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
  2186. *
  2187. * Tries to remove all the page table entries which are mapping this
  2188. * folio and replace them with special device exclusive swap entries to
  2189. * grant a device exclusive access to the folio.
  2190. *
  2191. * Context: Caller must hold the folio lock.
  2192. * Return: false if the page is still mapped, or if it could not be unmapped
  2193. * from the expected address. Otherwise returns true (success).
  2194. */
  2195. static bool folio_make_device_exclusive(struct folio *folio,
  2196. struct mm_struct *mm, unsigned long address, void *owner)
  2197. {
  2198. struct make_exclusive_args args = {
  2199. .mm = mm,
  2200. .address = address,
  2201. .owner = owner,
  2202. .valid = false,
  2203. };
  2204. struct rmap_walk_control rwc = {
  2205. .rmap_one = page_make_device_exclusive_one,
  2206. .done = folio_not_mapped,
  2207. .anon_lock = folio_lock_anon_vma_read,
  2208. .arg = &args,
  2209. };
  2210. /*
  2211. * Restrict to anonymous folios for now to avoid potential writeback
  2212. * issues.
  2213. */
  2214. if (!folio_test_anon(folio))
  2215. return false;
  2216. rmap_walk(folio, &rwc);
  2217. return args.valid && !folio_mapcount(folio);
  2218. }
/**
 * make_device_exclusive_range() - Mark a range for exclusive use by a device
 * @mm: mm_struct of associated target process
 * @start: start of the region to mark for exclusive device access
 * @end: end address of region
 * @pages: returns the pages which were successfully marked for exclusive access
 * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
 *
 * Returns: number of pages found in the range by GUP. A page is marked for
 * exclusive access only if the page pointer is non-NULL.
 *
 * This function finds ptes mapping page(s) to the given address range, locks
 * them and replaces mappings with special swap entries preventing userspace CPU
 * access. On fault these entries are replaced with the original mapping after
 * calling MMU notifiers.
 *
 * A driver using this to program access from a device must use an mmu notifier
 * critical section to hold a device-specific lock during programming. Once
 * programming is complete, it should drop the page lock and reference, after
 * which point CPU access to the page will revoke the exclusive access.
 */
  2240. int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
  2241. unsigned long end, struct page **pages,
  2242. void *owner)
  2243. {
  2244. long npages = (end - start) >> PAGE_SHIFT;
  2245. long i;
  2246. npages = get_user_pages_remote(mm, start, npages,
  2247. FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
  2248. pages, NULL);
  2249. if (npages < 0)
  2250. return npages;
  2251. for (i = 0; i < npages; i++, start += PAGE_SIZE) {
  2252. struct folio *folio = page_folio(pages[i]);
  2253. if (PageTail(pages[i]) || !folio_trylock(folio)) {
  2254. folio_put(folio);
  2255. pages[i] = NULL;
  2256. continue;
  2257. }
  2258. if (!folio_make_device_exclusive(folio, mm, start, owner)) {
  2259. folio_unlock(folio);
  2260. folio_put(folio);
  2261. pages[i] = NULL;
  2262. }
  2263. }
  2264. return npages;
  2265. }
  2266. EXPORT_SYMBOL_GPL(make_device_exclusive_range);
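/*
 * Illustrative usage sketch (not a call site in this file): a driver wanting
 * exclusive access to a single page might do roughly the following, with
 * "owner" being whatever cookie it also checks in its MMU notifier, and "mm",
 * "addr" and "ret" being the caller's own variables:
 *
 *	struct page *page = NULL;
 *
 *	mmap_read_lock(mm);
 *	ret = make_device_exclusive_range(mm, addr, addr + PAGE_SIZE,
 *					  &page, owner);
 *	mmap_read_unlock(mm);
 *	if (ret == 1 && page) {
 *		... program the device, then ...
 *		folio_unlock(page_folio(page));
 *		put_page(page);
 *	}
 *
 * See the Nouveau SVM driver for a real user of this interface.
 */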
  2267. #endif
  2268. void __put_anon_vma(struct anon_vma *anon_vma)
  2269. {
  2270. struct anon_vma *root = anon_vma->root;
  2271. anon_vma_free(anon_vma);
  2272. if (root != anon_vma && atomic_dec_and_test(&root->refcount))
  2273. anon_vma_free(root);
  2274. }
  2275. static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
  2276. struct rmap_walk_control *rwc)
  2277. {
  2278. struct anon_vma *anon_vma;
  2279. if (rwc->anon_lock)
  2280. return rwc->anon_lock(folio, rwc);
	/*
	 * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read()
	 * because that depends on page_mapped(); but not all its usages
	 * are holding mmap_lock. Users without mmap_lock are required to
	 * take a reference count to prevent the anon_vma from disappearing.
	 */
  2287. anon_vma = folio_anon_vma(folio);
  2288. if (!anon_vma)
  2289. return NULL;
  2290. if (anon_vma_trylock_read(anon_vma))
  2291. goto out;
  2292. if (rwc->try_lock) {
  2293. anon_vma = NULL;
  2294. rwc->contended = true;
  2295. goto out;
  2296. }
  2297. anon_vma_lock_read(anon_vma);
  2298. out:
  2299. return anon_vma;
  2300. }
/*
 * rmap_walk_anon - apply the rmap walk callbacks to an anonymous folio using
 * the object-based rmap method
 * @folio: the folio to be handled
 * @rwc: control variable according to each walk type
 * @locked: caller holds relevant rmap lock
 *
 * Find all the mappings of a folio using the mapping pointer and the vma
 * chains contained in the anon_vma struct it points to.
 */
  2311. static void rmap_walk_anon(struct folio *folio,
  2312. struct rmap_walk_control *rwc, bool locked)
  2313. {
  2314. struct anon_vma *anon_vma;
  2315. pgoff_t pgoff_start, pgoff_end;
  2316. struct anon_vma_chain *avc;
  2317. if (locked) {
  2318. anon_vma = folio_anon_vma(folio);
  2319. /* anon_vma disappear under us? */
  2320. VM_BUG_ON_FOLIO(!anon_vma, folio);
  2321. } else {
  2322. anon_vma = rmap_walk_anon_lock(folio, rwc);
  2323. }
  2324. if (!anon_vma)
  2325. return;
  2326. pgoff_start = folio_pgoff(folio);
  2327. pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
  2328. anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
  2329. pgoff_start, pgoff_end) {
  2330. struct vm_area_struct *vma = avc->vma;
  2331. unsigned long address = vma_address(vma, pgoff_start,
  2332. folio_nr_pages(folio));
  2333. VM_BUG_ON_VMA(address == -EFAULT, vma);
  2334. cond_resched();
  2335. if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
  2336. continue;
  2337. if (!rwc->rmap_one(folio, vma, address, rwc->arg))
  2338. break;
  2339. if (rwc->done && rwc->done(folio))
  2340. break;
  2341. }
  2342. if (!locked)
  2343. anon_vma_unlock_read(anon_vma);
  2344. }
/*
 * rmap_walk_file - apply the rmap walk callbacks to a file-backed folio using
 * the object-based rmap method
 * @folio: the folio to be handled
 * @rwc: control variable according to each walk type
 * @locked: caller holds relevant rmap lock
 *
 * Find all the mappings of a folio using the mapping pointer and the vma chains
 * contained in the address_space struct it points to.
 */
  2354. static void rmap_walk_file(struct folio *folio,
  2355. struct rmap_walk_control *rwc, bool locked)
  2356. {
  2357. struct address_space *mapping = folio_mapping(folio);
  2358. pgoff_t pgoff_start, pgoff_end;
  2359. struct vm_area_struct *vma;
  2360. /*
  2361. * The page lock not only makes sure that page->mapping cannot
  2362. * suddenly be NULLified by truncation, it makes sure that the
  2363. * structure at mapping cannot be freed and reused yet,
  2364. * so we can safely take mapping->i_mmap_rwsem.
  2365. */
  2366. VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
  2367. if (!mapping)
  2368. return;
  2369. pgoff_start = folio_pgoff(folio);
  2370. pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
  2371. if (!locked) {
  2372. if (i_mmap_trylock_read(mapping))
  2373. goto lookup;
  2374. if (rwc->try_lock) {
  2375. rwc->contended = true;
  2376. return;
  2377. }
  2378. i_mmap_lock_read(mapping);
  2379. }
  2380. lookup:
  2381. vma_interval_tree_foreach(vma, &mapping->i_mmap,
  2382. pgoff_start, pgoff_end) {
  2383. unsigned long address = vma_address(vma, pgoff_start,
  2384. folio_nr_pages(folio));
  2385. VM_BUG_ON_VMA(address == -EFAULT, vma);
  2386. cond_resched();
  2387. if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
  2388. continue;
  2389. if (!rwc->rmap_one(folio, vma, address, rwc->arg))
  2390. goto done;
  2391. if (rwc->done && rwc->done(folio))
  2392. goto done;
  2393. }
  2394. done:
  2395. if (!locked)
  2396. i_mmap_unlock_read(mapping);
  2397. }
  2398. void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc)
  2399. {
  2400. if (unlikely(folio_test_ksm(folio)))
  2401. rmap_walk_ksm(folio, rwc);
  2402. else if (folio_test_anon(folio))
  2403. rmap_walk_anon(folio, rwc, false);
  2404. else
  2405. rmap_walk_file(folio, rwc, false);
  2406. }
  2407. /* Like rmap_walk, but caller holds relevant rmap lock */
  2408. void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc)
  2409. {
  2410. /* no ksm support for now */
  2411. VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio);
  2412. if (folio_test_anon(folio))
  2413. rmap_walk_anon(folio, rwc, true);
  2414. else
  2415. rmap_walk_file(folio, rwc, true);
  2416. }
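/*
 * Illustrative usage sketch (not a call site in this file): a new rmap walk
 * is built by filling in an rmap_walk_control with a per-mapping callback and
 * handing the folio to rmap_walk(). The callback and state names below are
 * made up for the example:
 *
 *	static bool my_rmap_one(struct folio *folio, struct vm_area_struct *vma,
 *				unsigned long address, void *arg)
 *	{
 *		struct my_walk_state *state = arg;
 *
 *		... inspect or modify the mapping at @address ...
 *		return true;	// keep walking; returning false stops the walk
 *	}
 *
 *	struct rmap_walk_control rwc = {
 *		.rmap_one = my_rmap_one,
 *		.arg = &state,
 *	};
 *	rmap_walk(folio, &rwc);
 */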
  2417. #ifdef CONFIG_HUGETLB_PAGE
  2418. /*
  2419. * The following two functions are for anonymous (private mapped) hugepages.
  2420. * Unlike common anonymous pages, anonymous hugepages have no accounting code
  2421. * and no lru code, because we handle hugepages differently from common pages.
  2422. */
  2423. void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
  2424. unsigned long address, rmap_t flags)
  2425. {
  2426. VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
  2427. VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
  2428. atomic_inc(&folio->_entire_mapcount);
  2429. atomic_inc(&folio->_large_mapcount);
  2430. if (flags & RMAP_EXCLUSIVE)
  2431. SetPageAnonExclusive(&folio->page);
  2432. VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 &&
  2433. PageAnonExclusive(&folio->page), folio);
  2434. }
  2435. void hugetlb_add_new_anon_rmap(struct folio *folio,
  2436. struct vm_area_struct *vma, unsigned long address)
  2437. {
  2438. VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
  2439. BUG_ON(address < vma->vm_start || address >= vma->vm_end);
  2440. /* increment count (starts at -1) */
  2441. atomic_set(&folio->_entire_mapcount, 0);
  2442. atomic_set(&folio->_large_mapcount, 0);
  2443. folio_clear_hugetlb_restore_reserve(folio);
  2444. __folio_set_anon(folio, vma, address, true);
  2445. SetPageAnonExclusive(&folio->page);
  2446. }
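/*
 * Illustrative usage sketch (not a call site in this file): the hugetlb fault
 * path wires up a freshly allocated anonymous hugetlb folio roughly as:
 *
 *	hugetlb_add_new_anon_rmap(folio, vma, haddr);
 *	set_huge_pte_at(mm, haddr, ptep, entry, huge_page_size(h));
 *
 * where "entry" is the huge PTE built by the fault path, "haddr" the
 * hstate-aligned fault address and "h" the VMA's hstate; see
 * hugetlb_no_page() in mm/hugetlb.c for the real sequence.
 */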
  2447. #endif /* CONFIG_HUGETLB_PAGE */