huge_memory.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2009 Red Hat, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/mm_types.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/compat.h>
#include <linux/pgalloc_tag.h>
#include <linux/pagewalk.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
#include "swap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

static struct shrinker *deferred_split_shrinker;
static unsigned long deferred_split_count(struct shrinker *shrink,
					  struct shrink_control *sc);
static unsigned long deferred_split_scan(struct shrinker *shrink,
					 struct shrink_control *sc);
static bool split_underused_thp = true;

static atomic_t huge_zero_refcount;
struct folio *huge_zero_folio __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;
static bool anon_orders_configured __initdata;
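
/*
 * Filter the requested THP orders down to those this VMA can actually use:
 * the intersection with the orders supported for the VMA type, minus any
 * orders ruled out by hardware, per-VMA flags, size/alignment, or (when
 * TVA_ENFORCE_SYSFS is set) the sysfs policy. Returns the surviving orders
 * as a bitmask, or 0 if none apply.
 */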
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
					 unsigned long vm_flags,
					 unsigned long tva_flags,
					 unsigned long orders)
{
	bool smaps = tva_flags & TVA_SMAPS;
	bool in_pf = tva_flags & TVA_IN_PF;
	bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
	unsigned long supported_orders;

	/* Check the intersection of requested and supported orders. */
	if (vma_is_anonymous(vma))
		supported_orders = THP_ORDERS_ALL_ANON;
	else if (vma_is_special_huge(vma))
		supported_orders = THP_ORDERS_ALL_SPECIAL;
	else
		supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;

	orders &= supported_orders;
	if (!orders)
		return 0;

	if (!vma->vm_mm)		/* vdso */
		return 0;

	if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags))
		return 0;

	/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
	if (vma_is_dax(vma))
		return in_pf ? orders : 0;

	/*
	 * khugepaged special VMA and hugetlb VMA.
	 * Must be checked after dax since some dax mappings may have
	 * VM_MIXEDMAP set.
	 */
	if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
		return 0;

	/*
	 * Check alignment for file vma and size for both file and anon vma by
	 * filtering out the unsuitable orders.
	 *
	 * Skip the check for page fault. Huge fault does the check in fault
	 * handlers.
	 */
	if (!in_pf) {
		int order = highest_order(orders);
		unsigned long addr;

		while (orders) {
			addr = vma->vm_end - (PAGE_SIZE << order);
			if (thp_vma_suitable_order(vma, addr, order))
				break;
			order = next_order(&orders, order);
		}

		if (!orders)
			return 0;
	}

	/*
	 * Enabled via shmem mount options or sysfs settings.
	 * Must be done before hugepage flags check since shmem has its
	 * own flags.
	 */
	if (!in_pf && shmem_file(vma->vm_file))
		return shmem_allowable_huge_orders(file_inode(vma->vm_file),
						   vma, vma->vm_pgoff, 0,
						   !enforce_sysfs);

	if (!vma_is_anonymous(vma)) {
		/*
		 * Enforce sysfs THP requirements as necessary. Anonymous vmas
		 * were already handled in thp_vma_allowable_orders().
		 */
		if (enforce_sysfs &&
		    (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
						    !hugepage_global_always())))
			return 0;

		/*
		 * Trust that ->huge_fault() handlers know what they are doing
		 * in fault path.
		 */
		if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
			return orders;
		/* Only regular file is valid in collapse path */
		if (((!in_pf || smaps)) && file_thp_enabled(vma))
			return orders;

		return 0;
	}

	if (vma_is_temporary_stack(vma))
		return 0;

	/*
	 * THPeligible bit of smaps should show 1 for proper VMAs even
	 * though anon_vma is not initialized yet.
	 *
	 * Allow page fault since anon_vma may be not initialized until
	 * the first page fault.
	 */
	if (!vma->anon_vma)
		return (smaps || in_pf) ? orders : 0;

	return orders;
}
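
/*
 * The huge zero folio is allocated lazily on first use and shared globally.
 * huge_zero_refcount is set to 2 on allocation: one reference for the caller
 * and one that is only dropped by the shrinker, so the folio survives until
 * memory pressure reclaims it.
 */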
static bool get_huge_zero_page(void)
{
	struct folio *zero_folio;
retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return true;

	zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
			HPAGE_PMD_ORDER);
	if (!zero_folio) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return false;
	}
	/* Ensure zero folio won't have large_rmappable flag set. */
	folio_clear_large_rmappable(zero_folio);
	preempt_disable();
	if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
		preempt_enable();
		folio_put(zero_folio);
		goto retry;
	}
	WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));

	/* We take additional reference here. It will be put back by shrinker */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	return true;
}

static void put_huge_zero_page(void)
{
	/*
	 * Counter should never go to zero here. Only shrinker can put
	 * last reference.
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}

struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		return READ_ONCE(huge_zero_folio);

	if (!get_huge_zero_page())
		return NULL;

	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();

	return READ_ONCE(huge_zero_folio);
}

void mm_put_huge_zero_folio(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();
}
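
/*
 * Shrinker callbacks for the huge zero folio: report it as reclaimable only
 * when the shrinker's own reference is the last one left, and free it by
 * swapping the refcount from 1 to 0 and dropping the folio.
 */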
static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* we can free zero page only if last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
		BUG_ON(zero_folio == NULL);
		WRITE_ONCE(huge_zero_pfn, ~0UL);
		folio_put(zero_folio);
		return HPAGE_PMD_NR;
	}

	return 0;
}

static struct shrinker *huge_zero_page_shrinker;

#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
		output = "[always] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always [madvise] never";
	else
		output = "always madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err = start_stop_khugepaged();
		if (err)
			ret = err;
	}
	return ret;
}

static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);

ssize_t single_hugepage_flag_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf,
				  enum transparent_hugepage_flag flag)
{
	return sysfs_emit(buf, "%d\n",
			  !!test_bit(flag, &transparent_hugepage_flags));
}

ssize_t single_hugepage_flag_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count,
				   enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value > 1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}

static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
		     &transparent_hugepage_flags))
		output = "[always] defer defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
			  &transparent_hugepage_flags))
		output = "always [defer] defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer [defer+madvise] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer defer+madvise [madvise] never";
	else
		output = "always defer defer+madvise madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer+madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		return -EINVAL;

	return count;
}
static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);

static ssize_t use_zero_page_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
					 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static ssize_t use_zero_page_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
					  TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);

static ssize_t hpage_pmd_size_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
}
static struct kobj_attribute hpage_pmd_size_attr =
	__ATTR_RO(hpage_pmd_size);

static ssize_t split_underused_thp_show(struct kobject *kobj,
					struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", split_underused_thp);
}

static ssize_t split_underused_thp_store(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 const char *buf, size_t count)
{
	int err = kstrtobool(buf, &split_underused_thp);

	if (err < 0)
		return err;

	return count;
}

static struct kobj_attribute split_underused_thp_attr = __ATTR(
	shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store);

static struct attribute *hugepage_attr[] = {
	&enabled_attr.attr,
	&defrag_attr.attr,
	&use_zero_page_attr.attr,
	&hpage_pmd_size_attr.attr,
#ifdef CONFIG_SHMEM
	&shmem_enabled_attr.attr,
#endif
	&split_underused_thp_attr.attr,
	NULL,
};

static const struct attribute_group hugepage_attr_group = {
	.attrs = hugepage_attr,
};

static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
static void thpsize_release(struct kobject *kobj);
static DEFINE_SPINLOCK(huge_anon_orders_lock);
static LIST_HEAD(thpsize_list);

static ssize_t anon_enabled_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	int order = to_thpsize(kobj)->order;
	const char *output;

	if (test_bit(order, &huge_anon_orders_always))
		output = "[always] inherit madvise never";
	else if (test_bit(order, &huge_anon_orders_inherit))
		output = "always [inherit] madvise never";
	else if (test_bit(order, &huge_anon_orders_madvise))
		output = "always inherit [madvise] never";
	else
		output = "always inherit madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t anon_enabled_store(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  const char *buf, size_t count)
{
	int order = to_thpsize(kobj)->order;
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_inherit);
		clear_bit(order, &huge_anon_orders_madvise);
		set_bit(order, &huge_anon_orders_always);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "inherit")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_madvise);
		set_bit(order, &huge_anon_orders_inherit);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "madvise")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_inherit);
		set_bit(order, &huge_anon_orders_madvise);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "never")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_inherit);
		clear_bit(order, &huge_anon_orders_madvise);
		spin_unlock(&huge_anon_orders_lock);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err;

		err = start_stop_khugepaged();
		if (err)
			ret = err;
	}
	return ret;
}

static struct kobj_attribute anon_enabled_attr =
	__ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store);

static struct attribute *anon_ctrl_attrs[] = {
	&anon_enabled_attr.attr,
	NULL,
};

static const struct attribute_group anon_ctrl_attr_grp = {
	.attrs = anon_ctrl_attrs,
};

static struct attribute *file_ctrl_attrs[] = {
#ifdef CONFIG_SHMEM
	&thpsize_shmem_enabled_attr.attr,
#endif
	NULL,
};

static const struct attribute_group file_ctrl_attr_grp = {
	.attrs = file_ctrl_attrs,
};

static struct attribute *any_ctrl_attrs[] = {
	NULL,
};

static const struct attribute_group any_ctrl_attr_grp = {
	.attrs = any_ctrl_attrs,
};

static const struct kobj_type thpsize_ktype = {
	.release = &thpsize_release,
	.sysfs_ops = &kobj_sysfs_ops,
};
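
/*
 * Per-CPU counters backing the per-size (mTHP) statistics exposed under
 * sysfs. Readers sum the per-CPU values without locking, so a value read
 * while updates are in flight is approximate.
 */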
DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};

static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
{
	unsigned long sum = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct mthp_stat *this = &per_cpu(mthp_stats, cpu);

		sum += this->stats[order][item];
	}

	return sum;
}

#define DEFINE_MTHP_STAT_ATTR(_name, _index)				\
static ssize_t _name##_show(struct kobject *kobj,			\
			struct kobj_attribute *attr, char *buf)		\
{									\
	int order = to_thpsize(kobj)->order;				\
									\
	return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));	\
}									\
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
#ifdef CONFIG_SHMEM
DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC);
DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK);
DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
#endif
DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);

static struct attribute *anon_stats_attrs[] = {
	&anon_fault_alloc_attr.attr,
	&anon_fault_fallback_attr.attr,
	&anon_fault_fallback_charge_attr.attr,
#ifndef CONFIG_SHMEM
	&swpout_attr.attr,
	&swpout_fallback_attr.attr,
#endif
	&split_deferred_attr.attr,
	&nr_anon_attr.attr,
	&nr_anon_partially_mapped_attr.attr,
	NULL,
};

static struct attribute_group anon_stats_attr_grp = {
	.name = "stats",
	.attrs = anon_stats_attrs,
};

static struct attribute *file_stats_attrs[] = {
#ifdef CONFIG_SHMEM
	&shmem_alloc_attr.attr,
	&shmem_fallback_attr.attr,
	&shmem_fallback_charge_attr.attr,
#endif
	NULL,
};

static struct attribute_group file_stats_attr_grp = {
	.name = "stats",
	.attrs = file_stats_attrs,
};

static struct attribute *any_stats_attrs[] = {
#ifdef CONFIG_SHMEM
	&swpout_attr.attr,
	&swpout_fallback_attr.attr,
#endif
	&split_attr.attr,
	&split_failed_attr.attr,
	NULL,
};

static struct attribute_group any_stats_attr_grp = {
	.name = "stats",
	.attrs = any_stats_attrs,
};

static int sysfs_add_group(struct kobject *kobj,
			   const struct attribute_group *grp)
{
	int ret = -ENOENT;

	/*
	 * If the group is named, try to merge first, assuming the subdirectory
	 * was already created. This avoids the warning emitted by
	 * sysfs_create_group() if the directory already exists.
	 */
	if (grp->name)
		ret = sysfs_merge_group(kobj, grp);
	if (ret)
		ret = sysfs_create_group(kobj, grp);

	return ret;
}
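
/*
 * Create the "hugepages-<size>kB" kobject for one THP order and attach the
 * control and stats attribute groups that apply to it (anonymous and/or
 * file-backed, plus the groups common to all orders).
 */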
static struct thpsize *thpsize_create(int order, struct kobject *parent)
{
	unsigned long size = (PAGE_SIZE << order) / SZ_1K;
	struct thpsize *thpsize;
	int ret = -ENOMEM;

	thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
	if (!thpsize)
		goto err;

	thpsize->order = order;

	ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
				   "hugepages-%lukB", size);
	if (ret) {
		kfree(thpsize);
		goto err;
	}

	ret = sysfs_add_group(&thpsize->kobj, &any_ctrl_attr_grp);
	if (ret)
		goto err_put;

	ret = sysfs_add_group(&thpsize->kobj, &any_stats_attr_grp);
	if (ret)
		goto err_put;

	if (BIT(order) & THP_ORDERS_ALL_ANON) {
		ret = sysfs_add_group(&thpsize->kobj, &anon_ctrl_attr_grp);
		if (ret)
			goto err_put;

		ret = sysfs_add_group(&thpsize->kobj, &anon_stats_attr_grp);
		if (ret)
			goto err_put;
	}

	if (BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT) {
		ret = sysfs_add_group(&thpsize->kobj, &file_ctrl_attr_grp);
		if (ret)
			goto err_put;

		ret = sysfs_add_group(&thpsize->kobj, &file_stats_attr_grp);
		if (ret)
			goto err_put;
	}

	return thpsize;
err_put:
	kobject_put(&thpsize->kobj);
err:
	return ERR_PTR(ret);
}

static void thpsize_release(struct kobject *kobj)
{
	kfree(to_thpsize(kobj));
}

static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	int err;
	struct thpsize *thpsize;
	unsigned long orders;
	int order;

	/*
	 * Default to setting PMD-sized THP to inherit the global setting and
	 * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
	 * constant so we have to do this here.
	 */
	if (!anon_orders_configured)
		huge_anon_orders_inherit = BIT(PMD_ORDER);

	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
	if (unlikely(!*hugepage_kobj)) {
		pr_err("failed to create transparent hugepage kobject\n");
		return -ENOMEM;
	}

	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto delete_obj;
	}

	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto remove_hp_group;
	}

	orders = THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT;
	order = highest_order(orders);
	while (orders) {
		thpsize = thpsize_create(order, *hugepage_kobj);
		if (IS_ERR(thpsize)) {
			pr_err("failed to create thpsize for order %d\n", order);
			err = PTR_ERR(thpsize);
			goto remove_all;
		}
		list_add(&thpsize->node, &thpsize_list);
		order = next_order(&orders, order);
	}

	return 0;

remove_all:
	hugepage_exit_sysfs(*hugepage_kobj);
	return err;
remove_hp_group:
	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
	kobject_put(*hugepage_kobj);
	return err;
}

static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
	struct thpsize *thpsize, *tmp;

	list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) {
		list_del(&thpsize->node);
		kobject_put(&thpsize->kobj);
	}

	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
	kobject_put(hugepage_kobj);
}
#else
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */
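
/*
 * Two shrinkers are registered at init time: "thp-zero" releases the shared
 * huge zero folio under memory pressure, and "thp-deferred_split" processes
 * THP folios queued on the deferred-split lists.
 */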
static int __init thp_shrinker_init(void)
{
	huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
	if (!huge_zero_page_shrinker)
		return -ENOMEM;

	deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
						 SHRINKER_MEMCG_AWARE |
						 SHRINKER_NONSLAB,
						 "thp-deferred_split");
	if (!deferred_split_shrinker) {
		shrinker_free(huge_zero_page_shrinker);
		return -ENOMEM;
	}

	huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
	huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
	shrinker_register(huge_zero_page_shrinker);

	deferred_split_shrinker->count_objects = deferred_split_count;
	deferred_split_shrinker->scan_objects = deferred_split_scan;
	shrinker_register(deferred_split_shrinker);

	return 0;
}

static void __init thp_shrinker_exit(void)
{
	shrinker_free(huge_zero_page_shrinker);
	shrinker_free(deferred_split_shrinker);
}

static int __init hugepage_init(void)
{
	int err;
	struct kobject *hugepage_kobj;

	if (!has_transparent_hugepage()) {
		transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
		return -EINVAL;
	}

	/*
	 * hugepages can't be allocated by the buddy allocator
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);

	err = hugepage_init_sysfs(&hugepage_kobj);
	if (err)
		goto err_sysfs;

	err = khugepaged_init();
	if (err)
		goto err_slab;

	err = thp_shrinker_init();
	if (err)
		goto err_shrinker;

	/*
	 * By default disable transparent hugepages on smaller systems,
	 * where the extra memory used could hurt more than TLB overhead
	 * is likely to save. The admin can still enable it through /sys.
	 */
	if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
		transparent_hugepage_flags = 0;
		return 0;
	}

	err = start_stop_khugepaged();
	if (err)
		goto err_khugepaged;

	return 0;
err_khugepaged:
	thp_shrinker_exit();
err_shrinker:
	khugepaged_destroy();
err_slab:
	hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
	return err;
}
subsys_initcall(hugepage_init);

static int __init setup_transparent_hugepage(char *str)
{
	int ret = 0;
	if (!str)
		goto out;
	if (!strcmp(str, "always")) {
		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
			&transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			&transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("transparent_hugepage= cannot parse, ignored\n");
	return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
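
/*
 * Parsing helpers for the "thp_anon=" boot parameter, which seeds the
 * per-order anonymous THP policy bitmaps (always/inherit/madvise/never)
 * before sysfs is available. Sizes are parsed with memparse() and must map
 * to an order in THP_ORDERS_ALL_ANON.
 */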
static inline int get_order_from_str(const char *size_str)
{
	unsigned long size;
	char *endptr;
	int order;

	size = memparse(size_str, &endptr);

	if (!is_power_of_2(size))
		goto err;
	order = get_order(size);
	if (BIT(order) & ~THP_ORDERS_ALL_ANON)
		goto err;

	return order;
err:
	pr_err("invalid size %s in thp_anon boot parameter\n", size_str);
	return -EINVAL;
}

static char str_dup[PAGE_SIZE] __initdata;
static int __init setup_thp_anon(char *str)
{
	char *token, *range, *policy, *subtoken;
	unsigned long always, inherit, madvise;
	char *start_size, *end_size;
	int start, end, nr;
	char *p;

	if (!str || strlen(str) + 1 > PAGE_SIZE)
		goto err;
	strcpy(str_dup, str);

	always = huge_anon_orders_always;
	madvise = huge_anon_orders_madvise;
	inherit = huge_anon_orders_inherit;
	p = str_dup;
	while ((token = strsep(&p, ";")) != NULL) {
		range = strsep(&token, ":");
		policy = token;

		if (!policy)
			goto err;

		while ((subtoken = strsep(&range, ",")) != NULL) {
			if (strchr(subtoken, '-')) {
				start_size = strsep(&subtoken, "-");
				end_size = subtoken;

				start = get_order_from_str(start_size);
				end = get_order_from_str(end_size);
			} else {
				start = end = get_order_from_str(subtoken);
			}

			if (start < 0 || end < 0 || start > end)
				goto err;

			nr = end - start + 1;
			if (!strcmp(policy, "always")) {
				bitmap_set(&always, start, nr);
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
			} else if (!strcmp(policy, "madvise")) {
				bitmap_set(&madvise, start, nr);
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&always, start, nr);
			} else if (!strcmp(policy, "inherit")) {
				bitmap_set(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
				bitmap_clear(&always, start, nr);
			} else if (!strcmp(policy, "never")) {
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
				bitmap_clear(&always, start, nr);
			} else {
				pr_err("invalid policy %s in thp_anon boot parameter\n", policy);
				goto err;
			}
		}
	}

	huge_anon_orders_always = always;
	huge_anon_orders_madvise = madvise;
	huge_anon_orders_inherit = inherit;
	anon_orders_configured = true;

	return 1;

err:
	pr_warn("thp_anon=%s: error parsing string, ignoring setting\n", str);
	return 0;
}
__setup("thp_anon=", setup_thp_anon);

pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd, vma);
	return pmd;
}
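
/*
 * A folio queued for deferred splitting goes on the split queue of its
 * memory cgroup when memcg is enabled and the folio is charged, otherwise
 * on the per-node queue.
 */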
#ifdef CONFIG_MEMCG
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	if (memcg)
		return &memcg->deferred_split_queue;
	else
		return &pgdat->deferred_split_queue;
}
#else
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	return &pgdat->deferred_split_queue;
}
#endif

static inline bool is_transparent_hugepage(const struct folio *folio)
{
	if (!folio_test_large(folio))
		return false;

	return is_huge_zero_folio(folio) ||
	       folio_test_large_rmappable(folio);
}
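
/*
 * Try to return a mapping address whose offset into the file is aligned for
 * huge mappings: pad the requested length by @size, let the normal search
 * place the range, then shift the result so that the returned address and
 * the file offset agree modulo @size.
 */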
static unsigned long __thp_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len,
		loff_t off, unsigned long flags, unsigned long size,
		vm_flags_t vm_flags)
{
	loff_t off_end = off + len;
	loff_t off_align = round_up(off, size);
	unsigned long len_pad, ret, off_sub;

	if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())
		return 0;

	if (off_end <= off_align || (off_end - off_align) < size)
		return 0;

	len_pad = len + size;
	if (len_pad < len || (off + len_pad) < off)
		return 0;

	ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
					   off >> PAGE_SHIFT, flags, vm_flags);

	/*
	 * The failure might be due to length padding. The caller will retry
	 * without the padding.
	 */
	if (IS_ERR_VALUE(ret))
		return 0;

	/*
	 * Do not try to align to THP boundary if allocation at the address
	 * hint succeeds.
	 */
	if (ret == addr)
		return addr;

	off_sub = (off - ret) & (size - 1);

	if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
		return ret + size;

	ret += off_sub;
	return ret;
}

unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags,
		vm_flags_t vm_flags)
{
	unsigned long ret;
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;

	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags);
	if (ret)
		return ret;

	return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags,
					    vm_flags);
}

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
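
/*
 * Instantiate a freshly allocated anonymous huge page at the faulting PMD:
 * charge it to the memcg, zero it, then, under the PMD lock, either hand the
 * fault to userfaultfd or install the PMD along with a deposited page table
 * for a later split.
 */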
  970. static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
  971. struct page *page, gfp_t gfp)
  972. {
  973. struct vm_area_struct *vma = vmf->vma;
  974. struct folio *folio = page_folio(page);
  975. pgtable_t pgtable;
  976. unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
  977. vm_fault_t ret = 0;
  978. VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
  979. if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
  980. folio_put(folio);
  981. count_vm_event(THP_FAULT_FALLBACK);
  982. count_vm_event(THP_FAULT_FALLBACK_CHARGE);
  983. count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
  984. count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
  985. return VM_FAULT_FALLBACK;
  986. }
  987. folio_throttle_swaprate(folio, gfp);
  988. pgtable = pte_alloc_one(vma->vm_mm);
  989. if (unlikely(!pgtable)) {
  990. ret = VM_FAULT_OOM;
  991. goto release;
  992. }
  993. folio_zero_user(folio, vmf->address);
  994. /*
  995. * The memory barrier inside __folio_mark_uptodate makes sure that
  996. * folio_zero_user writes become visible before the set_pmd_at()
  997. * write.
  998. */
  999. __folio_mark_uptodate(folio);
  1000. vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
  1001. if (unlikely(!pmd_none(*vmf->pmd))) {
  1002. goto unlock_release;
  1003. } else {
  1004. pmd_t entry;
  1005. ret = check_stable_address_space(vma->vm_mm);
  1006. if (ret)
  1007. goto unlock_release;
  1008. /* Deliver the page fault to userland */
  1009. if (userfaultfd_missing(vma)) {
  1010. spin_unlock(vmf->ptl);
  1011. folio_put(folio);
  1012. pte_free(vma->vm_mm, pgtable);
  1013. ret = handle_userfault(vmf, VM_UFFD_MISSING);
  1014. VM_BUG_ON(ret & VM_FAULT_FALLBACK);
  1015. return ret;
  1016. }
  1017. entry = mk_huge_pmd(page, vma->vm_page_prot);
  1018. entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
  1019. folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
  1020. folio_add_lru_vma(folio, vma);
  1021. pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
  1022. set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
  1023. update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
  1024. add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
  1025. mm_inc_nr_ptes(vma->vm_mm);
  1026. deferred_split_folio(folio, false);
  1027. spin_unlock(vmf->ptl);
  1028. count_vm_event(THP_FAULT_ALLOC);
  1029. count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
  1030. count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
  1031. }
  1032. return 0;
  1033. unlock_release:
  1034. spin_unlock(vmf->ptl);
  1035. release:
  1036. if (pgtable)
  1037. pte_free(vma->vm_mm, pgtable);
  1038. folio_put(folio);
  1039. return ret;
  1040. }
  1041. /*
  1042. * always: directly stall for all thp allocations
  1043. * defer: wake kswapd and fail if not immediately available
  1044. * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
  1045. * fail if not immediately available
  1046. * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
  1047. * available
  1048. * never: never stall for any thp allocation
  1049. */
  1050. gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
  1051. {
  1052. const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
  1053. /* Always do synchronous compaction */
  1054. if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
  1055. return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
  1056. /* Kick kcompactd and fail quickly */
  1057. if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
  1058. return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
  1059. /* Synchronous compaction if madvised, otherwise kick kcompactd */
  1060. if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
  1061. return GFP_TRANSHUGE_LIGHT |
  1062. (vma_madvised ? __GFP_DIRECT_RECLAIM :
  1063. __GFP_KSWAPD_RECLAIM);
  1064. /* Only do synchronous compaction if madvised */
  1065. if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
  1066. return GFP_TRANSHUGE_LIGHT |
  1067. (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
  1068. return GFP_TRANSHUGE_LIGHT;
  1069. }
  1070. /* Caller must hold page table lock. */
  1071. static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
  1072. struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
  1073. struct folio *zero_folio)
  1074. {
  1075. pmd_t entry;
  1076. if (!pmd_none(*pmd))
  1077. return;
  1078. entry = mk_pmd(&zero_folio->page, vma->vm_page_prot);
  1079. entry = pmd_mkhuge(entry);
  1080. pgtable_trans_huge_deposit(mm, pmd, pgtable);
  1081. set_pmd_at(mm, haddr, pmd, entry);
  1082. mm_inc_nr_ptes(mm);
  1083. }
  1084. vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
  1085. {
  1086. struct vm_area_struct *vma = vmf->vma;
  1087. gfp_t gfp;
  1088. struct folio *folio;
  1089. unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
  1090. vm_fault_t ret;
  1091. if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
  1092. return VM_FAULT_FALLBACK;
  1093. ret = vmf_anon_prepare(vmf);
  1094. if (ret)
  1095. return ret;
  1096. khugepaged_enter_vma(vma, vma->vm_flags);
  1097. if (!(vmf->flags & FAULT_FLAG_WRITE) &&
  1098. !mm_forbids_zeropage(vma->vm_mm) &&
  1099. transparent_hugepage_use_zero_page()) {
  1100. pgtable_t pgtable;
  1101. struct folio *zero_folio;
  1102. vm_fault_t ret;
  1103. pgtable = pte_alloc_one(vma->vm_mm);
  1104. if (unlikely(!pgtable))
  1105. return VM_FAULT_OOM;
  1106. zero_folio = mm_get_huge_zero_folio(vma->vm_mm);
  1107. if (unlikely(!zero_folio)) {
  1108. pte_free(vma->vm_mm, pgtable);
  1109. count_vm_event(THP_FAULT_FALLBACK);
  1110. return VM_FAULT_FALLBACK;
  1111. }
  1112. vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
  1113. ret = 0;
  1114. if (pmd_none(*vmf->pmd)) {
  1115. ret = check_stable_address_space(vma->vm_mm);
  1116. if (ret) {
  1117. spin_unlock(vmf->ptl);
  1118. pte_free(vma->vm_mm, pgtable);
  1119. } else if (userfaultfd_missing(vma)) {
  1120. spin_unlock(vmf->ptl);
  1121. pte_free(vma->vm_mm, pgtable);
  1122. ret = handle_userfault(vmf, VM_UFFD_MISSING);
  1123. VM_BUG_ON(ret & VM_FAULT_FALLBACK);
  1124. } else {
  1125. set_huge_zero_folio(pgtable, vma->vm_mm, vma,
  1126. haddr, vmf->pmd, zero_folio);
  1127. update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
  1128. spin_unlock(vmf->ptl);
  1129. }
  1130. } else {
  1131. spin_unlock(vmf->ptl);
  1132. pte_free(vma->vm_mm, pgtable);
  1133. }
  1134. return ret;
  1135. }
  1136. gfp = vma_thp_gfp_mask(vma);
  1137. folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
  1138. if (unlikely(!folio)) {
  1139. count_vm_event(THP_FAULT_FALLBACK);
  1140. count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
  1141. return VM_FAULT_FALLBACK;
  1142. }
  1143. return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
  1144. }
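/*
 * Hedged sketch of how this entry point is reached. The dispatch lives in
 * mm/memory.c, not here, and the exact shape (in particular the ->huge_fault
 * order argument) varies by kernel version; treat this as an approximation:
 *
 *      if (vma_is_anonymous(vmf->vma))
 *              return do_huge_pmd_anonymous_page(vmf);
 *      if (vmf->vma->vm_ops && vmf->vma->vm_ops->huge_fault)
 *              return vmf->vma->vm_ops->huge_fault(vmf, PMD_ORDER);
 *      return VM_FAULT_FALLBACK;
 *
 * so a VM_FAULT_FALLBACK return here simply sends the fault back to the
 * normal PTE-sized path.
 */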
  1145. static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
  1146. pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
  1147. pgtable_t pgtable)
  1148. {
  1149. struct mm_struct *mm = vma->vm_mm;
  1150. pmd_t entry;
  1151. spinlock_t *ptl;
  1152. ptl = pmd_lock(mm, pmd);
  1153. if (!pmd_none(*pmd)) {
  1154. if (write) {
  1155. if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
  1156. WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
  1157. goto out_unlock;
  1158. }
  1159. entry = pmd_mkyoung(*pmd);
  1160. entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
  1161. if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
  1162. update_mmu_cache_pmd(vma, addr, pmd);
  1163. }
  1164. goto out_unlock;
  1165. }
  1166. entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
  1167. if (pfn_t_devmap(pfn))
  1168. entry = pmd_mkdevmap(entry);
  1169. else
  1170. entry = pmd_mkspecial(entry);
  1171. if (write) {
  1172. entry = pmd_mkyoung(pmd_mkdirty(entry));
  1173. entry = maybe_pmd_mkwrite(entry, vma);
  1174. }
  1175. if (pgtable) {
  1176. pgtable_trans_huge_deposit(mm, pmd, pgtable);
  1177. mm_inc_nr_ptes(mm);
  1178. pgtable = NULL;
  1179. }
  1180. set_pmd_at(mm, addr, pmd, entry);
  1181. update_mmu_cache_pmd(vma, addr, pmd);
  1182. out_unlock:
  1183. spin_unlock(ptl);
  1184. if (pgtable)
  1185. pte_free(mm, pgtable);
  1186. }
  1187. /**
  1188. * vmf_insert_pfn_pmd - insert a pmd size pfn
  1189. * @vmf: Structure describing the fault
  1190. * @pfn: pfn to insert
  1191. * @write: whether it's a write fault
  1192. *
  1193. * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
  1194. *
  1195. * Return: vm_fault_t value.
  1196. */
  1197. vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
  1198. {
  1199. unsigned long addr = vmf->address & PMD_MASK;
  1200. struct vm_area_struct *vma = vmf->vma;
  1201. pgprot_t pgprot = vma->vm_page_prot;
  1202. pgtable_t pgtable = NULL;
  1203. /*
  1204. * If we had pmd_special, we could avoid all these restrictions,
  1205. * but we need to be consistent with PTEs and architectures that
  1206. * can't support a 'special' bit.
  1207. */
  1208. BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
  1209. !pfn_t_devmap(pfn));
  1210. BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
  1211. (VM_PFNMAP|VM_MIXEDMAP));
  1212. BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
  1213. if (addr < vma->vm_start || addr >= vma->vm_end)
  1214. return VM_FAULT_SIGBUS;
  1215. if (arch_needs_pgtable_deposit()) {
  1216. pgtable = pte_alloc_one(vma->vm_mm);
  1217. if (!pgtable)
  1218. return VM_FAULT_OOM;
  1219. }
  1220. track_pfn_insert(vma, &pgprot, pfn);
  1221. insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
  1222. return VM_FAULT_NOPAGE;
  1223. }
  1224. EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
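/*
 * Illustrative use from a driver's ->huge_fault handler. my_dev_huge_fault()
 * and my_dev_addr_to_pfn() are hypothetical, and the ->huge_fault signature
 * differs between kernel versions; the real users are DAX-style drivers that
 * map device memory at PMD granularity:
 *
 *      static vm_fault_t my_dev_huge_fault(struct vm_fault *vmf,
 *                                          unsigned int order)
 *      {
 *              unsigned long pfn;
 *
 *              if (order != PMD_ORDER)
 *                      return VM_FAULT_FALLBACK;
 *              pfn = my_dev_addr_to_pfn(vmf->vma, vmf->address & PMD_MASK);
 *              return vmf_insert_pfn_pmd(vmf, pfn_to_pfn_t(pfn),
 *                                        vmf->flags & FAULT_FLAG_WRITE);
 *      }
 *
 * The VMA is expected to be VM_PFNMAP/VM_MIXEDMAP (or the pfn a devmap one),
 * matching the BUG_ON() checks above.
 */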
  1225. #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
  1226. static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
  1227. {
  1228. if (likely(vma->vm_flags & VM_WRITE))
  1229. pud = pud_mkwrite(pud);
  1230. return pud;
  1231. }
  1232. static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
  1233. pud_t *pud, pfn_t pfn, bool write)
  1234. {
  1235. struct mm_struct *mm = vma->vm_mm;
  1236. pgprot_t prot = vma->vm_page_prot;
  1237. pud_t entry;
  1238. spinlock_t *ptl;
  1239. ptl = pud_lock(mm, pud);
  1240. if (!pud_none(*pud)) {
  1241. if (write) {
  1242. if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn)))
  1243. goto out_unlock;
  1244. entry = pud_mkyoung(*pud);
  1245. entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
  1246. if (pudp_set_access_flags(vma, addr, pud, entry, 1))
  1247. update_mmu_cache_pud(vma, addr, pud);
  1248. }
  1249. goto out_unlock;
  1250. }
  1251. entry = pud_mkhuge(pfn_t_pud(pfn, prot));
  1252. if (pfn_t_devmap(pfn))
  1253. entry = pud_mkdevmap(entry);
  1254. else
  1255. entry = pud_mkspecial(entry);
  1256. if (write) {
  1257. entry = pud_mkyoung(pud_mkdirty(entry));
  1258. entry = maybe_pud_mkwrite(entry, vma);
  1259. }
  1260. set_pud_at(mm, addr, pud, entry);
  1261. update_mmu_cache_pud(vma, addr, pud);
  1262. out_unlock:
  1263. spin_unlock(ptl);
  1264. }
  1265. /**
  1266. * vmf_insert_pfn_pud - insert a pud size pfn
  1267. * @vmf: Structure describing the fault
  1268. * @pfn: pfn to insert
  1269. * @write: whether it's a write fault
  1270. *
  1271. * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
  1272. *
  1273. * Return: vm_fault_t value.
  1274. */
  1275. vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
  1276. {
  1277. unsigned long addr = vmf->address & PUD_MASK;
  1278. struct vm_area_struct *vma = vmf->vma;
  1279. pgprot_t pgprot = vma->vm_page_prot;
  1280. /*
  1281. * If we had pud_special, we could avoid all these restrictions,
  1282. * but we need to be consistent with PTEs and architectures that
  1283. * can't support a 'special' bit.
  1284. */
  1285. BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
  1286. !pfn_t_devmap(pfn));
  1287. BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
  1288. (VM_PFNMAP|VM_MIXEDMAP));
  1289. BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
  1290. if (addr < vma->vm_start || addr >= vma->vm_end)
  1291. return VM_FAULT_SIGBUS;
  1292. track_pfn_insert(vma, &pgprot, pfn);
  1293. insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
  1294. return VM_FAULT_NOPAGE;
  1295. }
  1296. EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
  1297. #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
  1298. void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
  1299. pmd_t *pmd, bool write)
  1300. {
  1301. pmd_t _pmd;
  1302. _pmd = pmd_mkyoung(*pmd);
  1303. if (write)
  1304. _pmd = pmd_mkdirty(_pmd);
  1305. if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
  1306. pmd, _pmd, write))
  1307. update_mmu_cache_pmd(vma, addr, pmd);
  1308. }
  1309. struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
  1310. pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
  1311. {
  1312. unsigned long pfn = pmd_pfn(*pmd);
  1313. struct mm_struct *mm = vma->vm_mm;
  1314. struct page *page;
  1315. int ret;
  1316. assert_spin_locked(pmd_lockptr(mm, pmd));
  1317. if (flags & FOLL_WRITE && !pmd_write(*pmd))
  1318. return NULL;
  1319. if (pmd_present(*pmd) && pmd_devmap(*pmd))
  1320. /* pass */;
  1321. else
  1322. return NULL;
  1323. if (flags & FOLL_TOUCH)
  1324. touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
  1325. /*
  1326. * device mapped pages can only be returned if the
  1327. * caller will manage the page reference count.
  1328. */
  1329. if (!(flags & (FOLL_GET | FOLL_PIN)))
  1330. return ERR_PTR(-EEXIST);
  1331. pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
  1332. *pgmap = get_dev_pagemap(pfn, *pgmap);
  1333. if (!*pgmap)
  1334. return ERR_PTR(-EFAULT);
  1335. page = pfn_to_page(pfn);
  1336. ret = try_grab_folio(page_folio(page), 1, flags);
  1337. if (ret)
  1338. page = ERR_PTR(ret);
  1339. return page;
  1340. }
  1341. int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
  1342. pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
  1343. struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
  1344. {
  1345. spinlock_t *dst_ptl, *src_ptl;
  1346. struct page *src_page;
  1347. struct folio *src_folio;
  1348. pmd_t pmd;
  1349. pgtable_t pgtable = NULL;
  1350. int ret = -ENOMEM;
  1351. pmd = pmdp_get_lockless(src_pmd);
  1352. if (unlikely(pmd_present(pmd) && pmd_special(pmd))) {
  1353. dst_ptl = pmd_lock(dst_mm, dst_pmd);
  1354. src_ptl = pmd_lockptr(src_mm, src_pmd);
  1355. spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
        /*
         * No need to recheck the pmd; it can't change while the write
         * mmap lock is held here.
         *
         * Meanwhile, make sure this is not a CoW VMA with a writable
         * mapping; otherwise either an anon page wrongly had the special
         * bit applied, or we made a PRIVATE mapping able to wrongly
         * write to the backing MMIO.
         */
  1365. VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd));
  1366. goto set_pmd;
  1367. }
        /* Skip if it can be re-filled on fault */
  1369. if (!vma_is_anonymous(dst_vma))
  1370. return 0;
  1371. pgtable = pte_alloc_one(dst_mm);
  1372. if (unlikely(!pgtable))
  1373. goto out;
  1374. dst_ptl = pmd_lock(dst_mm, dst_pmd);
  1375. src_ptl = pmd_lockptr(src_mm, src_pmd);
  1376. spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
  1377. ret = -EAGAIN;
  1378. pmd = *src_pmd;
  1379. #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
  1380. if (unlikely(is_swap_pmd(pmd))) {
  1381. swp_entry_t entry = pmd_to_swp_entry(pmd);
  1382. VM_BUG_ON(!is_pmd_migration_entry(pmd));
  1383. if (!is_readable_migration_entry(entry)) {
  1384. entry = make_readable_migration_entry(
  1385. swp_offset(entry));
  1386. pmd = swp_entry_to_pmd(entry);
  1387. if (pmd_swp_soft_dirty(*src_pmd))
  1388. pmd = pmd_swp_mksoft_dirty(pmd);
  1389. if (pmd_swp_uffd_wp(*src_pmd))
  1390. pmd = pmd_swp_mkuffd_wp(pmd);
  1391. set_pmd_at(src_mm, addr, src_pmd, pmd);
  1392. }
  1393. add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
  1394. mm_inc_nr_ptes(dst_mm);
  1395. pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
  1396. if (!userfaultfd_wp(dst_vma))
  1397. pmd = pmd_swp_clear_uffd_wp(pmd);
  1398. set_pmd_at(dst_mm, addr, dst_pmd, pmd);
  1399. ret = 0;
  1400. goto out_unlock;
  1401. }
  1402. #endif
  1403. if (unlikely(!pmd_trans_huge(pmd))) {
  1404. pte_free(dst_mm, pgtable);
  1405. goto out_unlock;
  1406. }
        /*
         * While the page table lock is held, the huge zero pmd cannot be
         * under splitting, since we never split the zero page itself, only
         * the pmd into a page table.
         */
  1412. if (is_huge_zero_pmd(pmd)) {
  1413. /*
  1414. * mm_get_huge_zero_folio() will never allocate a new
  1415. * folio here, since we already have a zero page to
  1416. * copy. It just takes a reference.
  1417. */
  1418. mm_get_huge_zero_folio(dst_mm);
  1419. goto out_zero_page;
  1420. }
  1421. src_page = pmd_page(pmd);
  1422. VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
  1423. src_folio = page_folio(src_page);
  1424. folio_get(src_folio);
  1425. if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, src_vma))) {
  1426. /* Page maybe pinned: split and retry the fault on PTEs. */
  1427. folio_put(src_folio);
  1428. pte_free(dst_mm, pgtable);
  1429. spin_unlock(src_ptl);
  1430. spin_unlock(dst_ptl);
  1431. __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
  1432. return -EAGAIN;
  1433. }
  1434. add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
  1435. out_zero_page:
  1436. mm_inc_nr_ptes(dst_mm);
  1437. pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
  1438. pmdp_set_wrprotect(src_mm, addr, src_pmd);
  1439. if (!userfaultfd_wp(dst_vma))
  1440. pmd = pmd_clear_uffd_wp(pmd);
  1441. pmd = pmd_wrprotect(pmd);
  1442. set_pmd:
  1443. pmd = pmd_mkold(pmd);
  1444. set_pmd_at(dst_mm, addr, dst_pmd, pmd);
  1445. ret = 0;
  1446. out_unlock:
  1447. spin_unlock(src_ptl);
  1448. spin_unlock(dst_ptl);
  1449. out:
  1450. return ret;
  1451. }
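/*
 * Hedged sketch of how the -EAGAIN convention above is consumed. This is an
 * approximation of the caller logic in copy_pmd_range() (mm/memory.c), not a
 * verbatim quote:
 *
 *      err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
 *                          addr, dst_vma, src_vma);
 *      if (err == -ENOMEM)
 *              return -ENOMEM;
 *      if (!err)
 *              continue;       // huge pmd copied (or skipped)
 *      // -EAGAIN: the pmd was split, fall through to the PTE-level copy
 *
 * so returning -EAGAIN after __split_huge_pmd() is not an error, just a
 * request to redo the copy at the smaller granularity.
 */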
  1452. #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
  1453. void touch_pud(struct vm_area_struct *vma, unsigned long addr,
  1454. pud_t *pud, bool write)
  1455. {
  1456. pud_t _pud;
  1457. _pud = pud_mkyoung(*pud);
  1458. if (write)
  1459. _pud = pud_mkdirty(_pud);
  1460. if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
  1461. pud, _pud, write))
  1462. update_mmu_cache_pud(vma, addr, pud);
  1463. }
  1464. int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
  1465. pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
  1466. struct vm_area_struct *vma)
  1467. {
  1468. spinlock_t *dst_ptl, *src_ptl;
  1469. pud_t pud;
  1470. int ret;
  1471. dst_ptl = pud_lock(dst_mm, dst_pud);
  1472. src_ptl = pud_lockptr(src_mm, src_pud);
  1473. spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
  1474. ret = -EAGAIN;
  1475. pud = *src_pud;
  1476. if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
  1477. goto out_unlock;
  1478. /*
  1479. * TODO: once we support anonymous pages, use
  1480. * folio_try_dup_anon_rmap_*() and split if duplicating fails.
  1481. */
  1482. if (is_cow_mapping(vma->vm_flags) && pud_write(pud)) {
  1483. pudp_set_wrprotect(src_mm, addr, src_pud);
  1484. pud = pud_wrprotect(pud);
  1485. }
  1486. pud = pud_mkold(pud);
  1487. set_pud_at(dst_mm, addr, dst_pud, pud);
  1488. ret = 0;
  1489. out_unlock:
  1490. spin_unlock(src_ptl);
  1491. spin_unlock(dst_ptl);
  1492. return ret;
  1493. }
  1494. void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
  1495. {
  1496. bool write = vmf->flags & FAULT_FLAG_WRITE;
  1497. vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
  1498. if (unlikely(!pud_same(*vmf->pud, orig_pud)))
  1499. goto unlock;
  1500. touch_pud(vmf->vma, vmf->address, vmf->pud, write);
  1501. unlock:
  1502. spin_unlock(vmf->ptl);
  1503. }
  1504. #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
  1505. void huge_pmd_set_accessed(struct vm_fault *vmf)
  1506. {
  1507. bool write = vmf->flags & FAULT_FLAG_WRITE;
  1508. vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
  1509. if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
  1510. goto unlock;
  1511. touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
  1512. unlock:
  1513. spin_unlock(vmf->ptl);
  1514. }
  1515. vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
  1516. {
  1517. const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
  1518. struct vm_area_struct *vma = vmf->vma;
  1519. struct folio *folio;
  1520. struct page *page;
  1521. unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
  1522. pmd_t orig_pmd = vmf->orig_pmd;
  1523. vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
  1524. VM_BUG_ON_VMA(!vma->anon_vma, vma);
  1525. if (is_huge_zero_pmd(orig_pmd))
  1526. goto fallback;
  1527. spin_lock(vmf->ptl);
  1528. if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
  1529. spin_unlock(vmf->ptl);
  1530. return 0;
  1531. }
  1532. page = pmd_page(orig_pmd);
  1533. folio = page_folio(page);
  1534. VM_BUG_ON_PAGE(!PageHead(page), page);
  1535. /* Early check when only holding the PT lock. */
  1536. if (PageAnonExclusive(page))
  1537. goto reuse;
  1538. if (!folio_trylock(folio)) {
  1539. folio_get(folio);
  1540. spin_unlock(vmf->ptl);
  1541. folio_lock(folio);
  1542. spin_lock(vmf->ptl);
  1543. if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
  1544. spin_unlock(vmf->ptl);
  1545. folio_unlock(folio);
  1546. folio_put(folio);
  1547. return 0;
  1548. }
  1549. folio_put(folio);
  1550. }
  1551. /* Recheck after temporarily dropping the PT lock. */
  1552. if (PageAnonExclusive(page)) {
  1553. folio_unlock(folio);
  1554. goto reuse;
  1555. }
  1556. /*
  1557. * See do_wp_page(): we can only reuse the folio exclusively if
  1558. * there are no additional references. Note that we always drain
  1559. * the LRU cache immediately after adding a THP.
  1560. */
  1561. if (folio_ref_count(folio) >
  1562. 1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
  1563. goto unlock_fallback;
  1564. if (folio_test_swapcache(folio))
  1565. folio_free_swap(folio);
  1566. if (folio_ref_count(folio) == 1) {
  1567. pmd_t entry;
  1568. folio_move_anon_rmap(folio, vma);
  1569. SetPageAnonExclusive(page);
  1570. folio_unlock(folio);
  1571. reuse:
  1572. if (unlikely(unshare)) {
  1573. spin_unlock(vmf->ptl);
  1574. return 0;
  1575. }
  1576. entry = pmd_mkyoung(orig_pmd);
  1577. entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
  1578. if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
  1579. update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
  1580. spin_unlock(vmf->ptl);
  1581. return 0;
  1582. }
  1583. unlock_fallback:
  1584. folio_unlock(folio);
  1585. spin_unlock(vmf->ptl);
  1586. fallback:
  1587. __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
  1588. return VM_FAULT_FALLBACK;
  1589. }
  1590. static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
  1591. unsigned long addr, pmd_t pmd)
  1592. {
  1593. struct page *page;
  1594. if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
  1595. return false;
  1596. /* Don't touch entries that are not even readable (NUMA hinting). */
  1597. if (pmd_protnone(pmd))
  1598. return false;
  1599. /* Do we need write faults for softdirty tracking? */
  1600. if (pmd_needs_soft_dirty_wp(vma, pmd))
  1601. return false;
  1602. /* Do we need write faults for uffd-wp tracking? */
  1603. if (userfaultfd_huge_pmd_wp(vma, pmd))
  1604. return false;
  1605. if (!(vma->vm_flags & VM_SHARED)) {
  1606. /* See can_change_pte_writable(). */
  1607. page = vm_normal_page_pmd(vma, addr, pmd);
  1608. return page && PageAnon(page) && PageAnonExclusive(page);
  1609. }
  1610. /* See can_change_pte_writable(). */
  1611. return pmd_dirty(pmd);
  1612. }
  1613. /* NUMA hinting page fault entry point for trans huge pmds */
  1614. vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
  1615. {
  1616. struct vm_area_struct *vma = vmf->vma;
  1617. struct folio *folio;
  1618. unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
  1619. int nid = NUMA_NO_NODE;
  1620. int target_nid, last_cpupid;
  1621. pmd_t pmd, old_pmd;
  1622. bool writable = false;
  1623. int flags = 0;
  1624. vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
  1625. old_pmd = pmdp_get(vmf->pmd);
  1626. if (unlikely(!pmd_same(old_pmd, vmf->orig_pmd))) {
  1627. spin_unlock(vmf->ptl);
  1628. return 0;
  1629. }
  1630. pmd = pmd_modify(old_pmd, vma->vm_page_prot);
  1631. /*
  1632. * Detect now whether the PMD could be writable; this information
  1633. * is only valid while holding the PT lock.
  1634. */
  1635. writable = pmd_write(pmd);
  1636. if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
  1637. can_change_pmd_writable(vma, vmf->address, pmd))
  1638. writable = true;
  1639. folio = vm_normal_folio_pmd(vma, haddr, pmd);
  1640. if (!folio)
  1641. goto out_map;
  1642. nid = folio_nid(folio);
  1643. target_nid = numa_migrate_check(folio, vmf, haddr, &flags, writable,
  1644. &last_cpupid);
  1645. if (target_nid == NUMA_NO_NODE)
  1646. goto out_map;
  1647. if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
  1648. flags |= TNF_MIGRATE_FAIL;
  1649. goto out_map;
  1650. }
  1651. /* The folio is isolated and isolation code holds a folio reference. */
  1652. spin_unlock(vmf->ptl);
  1653. writable = false;
  1654. if (!migrate_misplaced_folio(folio, vma, target_nid)) {
  1655. flags |= TNF_MIGRATED;
  1656. nid = target_nid;
  1657. task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
  1658. return 0;
  1659. }
  1660. flags |= TNF_MIGRATE_FAIL;
  1661. vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
  1662. if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) {
  1663. spin_unlock(vmf->ptl);
  1664. return 0;
  1665. }
  1666. out_map:
  1667. /* Restore the PMD */
  1668. pmd = pmd_modify(pmdp_get(vmf->pmd), vma->vm_page_prot);
  1669. pmd = pmd_mkyoung(pmd);
  1670. if (writable)
  1671. pmd = pmd_mkwrite(pmd, vma);
  1672. set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
  1673. update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
  1674. spin_unlock(vmf->ptl);
  1675. if (nid != NUMA_NO_NODE)
  1676. task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
  1677. return 0;
  1678. }
/*
 * Return true if we do MADV_FREE successfully on the entire pmd page.
 * Otherwise, return false.
 */
  1683. bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
  1684. pmd_t *pmd, unsigned long addr, unsigned long next)
  1685. {
  1686. spinlock_t *ptl;
  1687. pmd_t orig_pmd;
  1688. struct folio *folio;
  1689. struct mm_struct *mm = tlb->mm;
  1690. bool ret = false;
  1691. tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
  1692. ptl = pmd_trans_huge_lock(pmd, vma);
  1693. if (!ptl)
  1694. goto out_unlocked;
  1695. orig_pmd = *pmd;
  1696. if (is_huge_zero_pmd(orig_pmd))
  1697. goto out;
  1698. if (unlikely(!pmd_present(orig_pmd))) {
  1699. VM_BUG_ON(thp_migration_supported() &&
  1700. !is_pmd_migration_entry(orig_pmd));
  1701. goto out;
  1702. }
  1703. folio = pmd_folio(orig_pmd);
        /*
         * If other processes are mapping this folio, we cannot discard
         * it unless they all do MADV_FREE, so skip the folio.
         */
  1708. if (folio_likely_mapped_shared(folio))
  1709. goto out;
  1710. if (!folio_trylock(folio))
  1711. goto out;
        /*
         * If the user wants to discard only part of the THP, split it so
         * MADV_FREE will deactivate just those pages.
         */
  1716. if (next - addr != HPAGE_PMD_SIZE) {
  1717. folio_get(folio);
  1718. spin_unlock(ptl);
  1719. split_folio(folio);
  1720. folio_unlock(folio);
  1721. folio_put(folio);
  1722. goto out_unlocked;
  1723. }
  1724. if (folio_test_dirty(folio))
  1725. folio_clear_dirty(folio);
  1726. folio_unlock(folio);
  1727. if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
  1728. pmdp_invalidate(vma, addr, pmd);
  1729. orig_pmd = pmd_mkold(orig_pmd);
  1730. orig_pmd = pmd_mkclean(orig_pmd);
  1731. set_pmd_at(mm, addr, pmd, orig_pmd);
  1732. tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
  1733. }
  1734. folio_mark_lazyfree(folio);
  1735. ret = true;
  1736. out:
  1737. spin_unlock(ptl);
  1738. out_unlocked:
  1739. return ret;
  1740. }
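/*
 * For reference, the userspace side that leads here is plain madvise(2) with
 * MADV_FREE over a THP-backed range; a minimal sketch:
 *
 *      #include <sys/mman.h>
 *
 *      // buf is 2MB-aligned and len a multiple of 2MB, so whole PMDs can be
 *      // handled by madvise_free_huge_pmd() without splitting.
 *      if (madvise(buf, len, MADV_FREE))
 *              perror("madvise");
 *
 * A range covering only part of a THP instead takes the split_folio() path
 * above.
 */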
  1741. static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
  1742. {
  1743. pgtable_t pgtable;
  1744. pgtable = pgtable_trans_huge_withdraw(mm, pmd);
  1745. pte_free(mm, pgtable);
  1746. mm_dec_nr_ptes(mm);
  1747. }
  1748. int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
  1749. pmd_t *pmd, unsigned long addr)
  1750. {
  1751. pmd_t orig_pmd;
  1752. spinlock_t *ptl;
  1753. tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
  1754. ptl = __pmd_trans_huge_lock(pmd, vma);
  1755. if (!ptl)
  1756. return 0;
        /*
         * For architectures like ppc64 we look at the deposited pgtable
         * when calling pmdp_huge_get_and_clear, so do the
         * pgtable_trans_huge_withdraw only after finishing the pmdp-related
         * operations.
         */
  1763. orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
  1764. tlb->fullmm);
  1765. arch_check_zapped_pmd(vma, orig_pmd);
  1766. tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
  1767. if (vma_is_special_huge(vma)) {
  1768. if (arch_needs_pgtable_deposit())
  1769. zap_deposited_table(tlb->mm, pmd);
  1770. spin_unlock(ptl);
  1771. } else if (is_huge_zero_pmd(orig_pmd)) {
  1772. zap_deposited_table(tlb->mm, pmd);
  1773. spin_unlock(ptl);
  1774. } else {
  1775. struct folio *folio = NULL;
  1776. int flush_needed = 1;
  1777. if (pmd_present(orig_pmd)) {
  1778. struct page *page = pmd_page(orig_pmd);
  1779. folio = page_folio(page);
  1780. folio_remove_rmap_pmd(folio, page, vma);
  1781. WARN_ON_ONCE(folio_mapcount(folio) < 0);
  1782. VM_BUG_ON_PAGE(!PageHead(page), page);
  1783. } else if (thp_migration_supported()) {
  1784. swp_entry_t entry;
  1785. VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
  1786. entry = pmd_to_swp_entry(orig_pmd);
  1787. folio = pfn_swap_entry_folio(entry);
  1788. flush_needed = 0;
  1789. } else
  1790. WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
  1791. if (folio_test_anon(folio)) {
  1792. zap_deposited_table(tlb->mm, pmd);
  1793. add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
  1794. } else {
  1795. if (arch_needs_pgtable_deposit())
  1796. zap_deposited_table(tlb->mm, pmd);
  1797. add_mm_counter(tlb->mm, mm_counter_file(folio),
  1798. -HPAGE_PMD_NR);
  1799. }
  1800. spin_unlock(ptl);
  1801. if (flush_needed)
  1802. tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
  1803. }
  1804. return 1;
  1805. }
  1806. #ifndef pmd_move_must_withdraw
  1807. static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
  1808. spinlock_t *old_pmd_ptl,
  1809. struct vm_area_struct *vma)
  1810. {
  1811. /*
  1812. * With split pmd lock we also need to move preallocated
  1813. * PTE page table if new_pmd is on different PMD page table.
  1814. *
  1815. * We also don't deposit and withdraw tables for file pages.
  1816. */
  1817. return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
  1818. }
  1819. #endif
  1820. static pmd_t move_soft_dirty_pmd(pmd_t pmd)
  1821. {
  1822. #ifdef CONFIG_MEM_SOFT_DIRTY
  1823. if (unlikely(is_pmd_migration_entry(pmd)))
  1824. pmd = pmd_swp_mksoft_dirty(pmd);
  1825. else if (pmd_present(pmd))
  1826. pmd = pmd_mksoft_dirty(pmd);
  1827. #endif
  1828. return pmd;
  1829. }
  1830. static pmd_t clear_uffd_wp_pmd(pmd_t pmd)
  1831. {
  1832. if (pmd_present(pmd))
  1833. pmd = pmd_clear_uffd_wp(pmd);
  1834. else if (is_swap_pmd(pmd))
  1835. pmd = pmd_swp_clear_uffd_wp(pmd);
  1836. return pmd;
  1837. }
  1838. bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
  1839. unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
  1840. {
  1841. spinlock_t *old_ptl, *new_ptl;
  1842. pmd_t pmd;
  1843. struct mm_struct *mm = vma->vm_mm;
  1844. bool force_flush = false;
  1845. /*
  1846. * The destination pmd shouldn't be established, free_pgtables()
  1847. * should have released it; but move_page_tables() might have already
  1848. * inserted a page table, if racing against shmem/file collapse.
  1849. */
  1850. if (!pmd_none(*new_pmd)) {
  1851. VM_BUG_ON(pmd_trans_huge(*new_pmd));
  1852. return false;
  1853. }
  1854. /*
  1855. * We don't have to worry about the ordering of src and dst
  1856. * ptlocks because exclusive mmap_lock prevents deadlock.
  1857. */
  1858. old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
  1859. if (old_ptl) {
  1860. new_ptl = pmd_lockptr(mm, new_pmd);
  1861. if (new_ptl != old_ptl)
  1862. spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
  1863. pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
  1864. if (pmd_present(pmd))
  1865. force_flush = true;
  1866. VM_BUG_ON(!pmd_none(*new_pmd));
  1867. if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
  1868. pgtable_t pgtable;
  1869. pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
  1870. pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
  1871. }
  1872. pmd = move_soft_dirty_pmd(pmd);
  1873. if (vma_has_uffd_without_event_remap(vma))
  1874. pmd = clear_uffd_wp_pmd(pmd);
  1875. set_pmd_at(mm, new_addr, new_pmd, pmd);
  1876. if (force_flush)
  1877. flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
  1878. if (new_ptl != old_ptl)
  1879. spin_unlock(new_ptl);
  1880. spin_unlock(old_ptl);
  1881. return true;
  1882. }
  1883. return false;
  1884. }
  1885. /*
  1886. * Returns
  1887. * - 0 if PMD could not be locked
  1888. * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
  1889. * or if prot_numa but THP migration is not supported
  1890. * - HPAGE_PMD_NR if protections changed and TLB flush necessary
  1891. */
  1892. int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
  1893. pmd_t *pmd, unsigned long addr, pgprot_t newprot,
  1894. unsigned long cp_flags)
  1895. {
  1896. struct mm_struct *mm = vma->vm_mm;
  1897. spinlock_t *ptl;
  1898. pmd_t oldpmd, entry;
  1899. bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
  1900. bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
  1901. bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
  1902. int ret = 1;
  1903. tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
  1904. if (prot_numa && !thp_migration_supported())
  1905. return 1;
  1906. ptl = __pmd_trans_huge_lock(pmd, vma);
  1907. if (!ptl)
  1908. return 0;
  1909. #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
  1910. if (is_swap_pmd(*pmd)) {
  1911. swp_entry_t entry = pmd_to_swp_entry(*pmd);
  1912. struct folio *folio = pfn_swap_entry_folio(entry);
  1913. pmd_t newpmd;
  1914. VM_BUG_ON(!is_pmd_migration_entry(*pmd));
  1915. if (is_writable_migration_entry(entry)) {
  1916. /*
  1917. * A protection check is difficult so
  1918. * just be safe and disable write
  1919. */
  1920. if (folio_test_anon(folio))
  1921. entry = make_readable_exclusive_migration_entry(swp_offset(entry));
  1922. else
  1923. entry = make_readable_migration_entry(swp_offset(entry));
  1924. newpmd = swp_entry_to_pmd(entry);
  1925. if (pmd_swp_soft_dirty(*pmd))
  1926. newpmd = pmd_swp_mksoft_dirty(newpmd);
  1927. } else {
  1928. newpmd = *pmd;
  1929. }
  1930. if (uffd_wp)
  1931. newpmd = pmd_swp_mkuffd_wp(newpmd);
  1932. else if (uffd_wp_resolve)
  1933. newpmd = pmd_swp_clear_uffd_wp(newpmd);
  1934. if (!pmd_same(*pmd, newpmd))
  1935. set_pmd_at(mm, addr, pmd, newpmd);
  1936. goto unlock;
  1937. }
  1938. #endif
  1939. if (prot_numa) {
  1940. struct folio *folio;
  1941. bool toptier;
  1942. /*
  1943. * Avoid trapping faults against the zero page. The read-only
  1944. * data is likely to be read-cached on the local CPU and
  1945. * local/remote hits to the zero page are not interesting.
  1946. */
  1947. if (is_huge_zero_pmd(*pmd))
  1948. goto unlock;
  1949. if (pmd_protnone(*pmd))
  1950. goto unlock;
  1951. folio = pmd_folio(*pmd);
  1952. toptier = node_is_toptier(folio_nid(folio));
  1953. /*
  1954. * Skip scanning top tier node if normal numa
  1955. * balancing is disabled
  1956. */
  1957. if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
  1958. toptier)
  1959. goto unlock;
  1960. if (folio_use_access_time(folio))
  1961. folio_xchg_access_time(folio,
  1962. jiffies_to_msecs(jiffies));
  1963. }
        /*
         * In case prot_numa, we are under mmap_read_lock(mm). It's critical
         * to not clear the pmd intermittently, to avoid a race with
         * MADV_DONTNEED, which is also under mmap_read_lock(mm):
         *
         *      CPU0:                           CPU1:
         *                                      change_huge_pmd(prot_numa=1)
         *                                       pmdp_huge_get_and_clear_notify()
         *      madvise_dontneed()
         *       zap_pmd_range()
         *        pmd_trans_huge(*pmd) == 0 (without ptl)
         *        // skip the pmd
         *                                       set_pmd_at();
         *                                       // pmd is re-established
         *
         * The race makes MADV_DONTNEED miss the huge pmd and fail to clear it,
         * which may break userspace.
         *
         * pmdp_invalidate_ad() is required to make sure we don't miss
         * dirty/young flags set by hardware.
         */
  1985. oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
  1986. entry = pmd_modify(oldpmd, newprot);
  1987. if (uffd_wp)
  1988. entry = pmd_mkuffd_wp(entry);
  1989. else if (uffd_wp_resolve)
                /*
                 * Leave the write bit to be handled by the page fault
                 * handler, so that things like COW can be handled
                 * properly.
                 */
  1995. entry = pmd_clear_uffd_wp(entry);
  1996. /* See change_pte_range(). */
  1997. if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
  1998. can_change_pmd_writable(vma, addr, entry))
  1999. entry = pmd_mkwrite(entry, vma);
  2000. ret = HPAGE_PMD_NR;
  2001. set_pmd_at(mm, addr, pmd, entry);
  2002. if (huge_pmd_needs_flush(oldpmd, entry))
  2003. tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
  2004. unlock:
  2005. spin_unlock(ptl);
  2006. return ret;
  2007. }
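/*
 * Hedged sketch of how the three return values documented above are meant to
 * be consumed by an mprotect-style caller (an approximation of the logic in
 * mm/mprotect.c, not a verbatim quote):
 *
 *      ret = change_huge_pmd(tlb, vma, pmd, addr, newprot, cp_flags);
 *      if (ret) {
 *              if (ret == HPAGE_PMD_NR)
 *                      pages += HPAGE_PMD_NR;  // protections changed
 *              continue;                       // huge pmd fully handled
 *      }
 *      // ret == 0: the pmd changed under us, fall back to the PTE loop
 *
 * Only the 0 case requires the caller to drop down to pte granularity.
 */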
  2008. /*
  2009. * Returns:
  2010. *
  2011. * - 0: if pud leaf changed from under us
  2012. * - 1: if pud can be skipped
  2013. * - HPAGE_PUD_NR: if pud was successfully processed
  2014. */
  2015. #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
  2016. int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
  2017. pud_t *pudp, unsigned long addr, pgprot_t newprot,
  2018. unsigned long cp_flags)
  2019. {
  2020. struct mm_struct *mm = vma->vm_mm;
  2021. pud_t oldpud, entry;
  2022. spinlock_t *ptl;
  2023. tlb_change_page_size(tlb, HPAGE_PUD_SIZE);
  2024. /* NUMA balancing doesn't apply to dax */
  2025. if (cp_flags & MM_CP_PROT_NUMA)
  2026. return 1;
        /*
         * Huge entries under userfault-wp only work with anonymous memory,
         * and we don't have anonymous PUDs yet.
         */
  2031. if (WARN_ON_ONCE(cp_flags & MM_CP_UFFD_WP_ALL))
  2032. return 1;
  2033. ptl = __pud_trans_huge_lock(pudp, vma);
  2034. if (!ptl)
  2035. return 0;
  2036. /*
  2037. * Can't clear PUD or it can race with concurrent zapping. See
  2038. * change_huge_pmd().
  2039. */
  2040. oldpud = pudp_invalidate(vma, addr, pudp);
  2041. entry = pud_modify(oldpud, newprot);
  2042. set_pud_at(mm, addr, pudp, entry);
  2043. tlb_flush_pud_range(tlb, addr, HPAGE_PUD_SIZE);
  2044. spin_unlock(ptl);
  2045. return HPAGE_PUD_NR;
  2046. }
  2047. #endif
  2048. #ifdef CONFIG_USERFAULTFD
/*
 * The PT lock for src_pmd and dst_vma/src_vma (for reading) are held by
 * the caller; this function must release the page table lock before
 * returning. It just moves the page from src_pmd to dst_pmd if possible.
 * Returns zero if it succeeded in moving the page, -EAGAIN if the operation
 * needs to be repeated by the caller, or another error code on failure.
 */
  2056. int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
  2057. struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
  2058. unsigned long dst_addr, unsigned long src_addr)
  2059. {
  2060. pmd_t _dst_pmd, src_pmdval;
  2061. struct page *src_page;
  2062. struct folio *src_folio;
  2063. struct anon_vma *src_anon_vma;
  2064. spinlock_t *src_ptl, *dst_ptl;
  2065. pgtable_t src_pgtable;
  2066. struct mmu_notifier_range range;
  2067. int err = 0;
  2068. src_pmdval = *src_pmd;
  2069. src_ptl = pmd_lockptr(mm, src_pmd);
  2070. lockdep_assert_held(src_ptl);
  2071. vma_assert_locked(src_vma);
  2072. vma_assert_locked(dst_vma);
  2073. /* Sanity checks before the operation */
  2074. if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) ||
  2075. WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) {
  2076. spin_unlock(src_ptl);
  2077. return -EINVAL;
  2078. }
  2079. if (!pmd_trans_huge(src_pmdval)) {
  2080. spin_unlock(src_ptl);
  2081. if (is_pmd_migration_entry(src_pmdval)) {
  2082. pmd_migration_entry_wait(mm, &src_pmdval);
  2083. return -EAGAIN;
  2084. }
  2085. return -ENOENT;
  2086. }
  2087. src_page = pmd_page(src_pmdval);
  2088. if (!is_huge_zero_pmd(src_pmdval)) {
  2089. if (unlikely(!PageAnonExclusive(src_page))) {
  2090. spin_unlock(src_ptl);
  2091. return -EBUSY;
  2092. }
  2093. src_folio = page_folio(src_page);
  2094. folio_get(src_folio);
  2095. } else
  2096. src_folio = NULL;
  2097. spin_unlock(src_ptl);
  2098. flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
  2099. mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr,
  2100. src_addr + HPAGE_PMD_SIZE);
  2101. mmu_notifier_invalidate_range_start(&range);
  2102. if (src_folio) {
  2103. folio_lock(src_folio);
  2104. /*
  2105. * split_huge_page walks the anon_vma chain without the page
  2106. * lock. Serialize against it with the anon_vma lock, the page
  2107. * lock is not enough.
  2108. */
  2109. src_anon_vma = folio_get_anon_vma(src_folio);
  2110. if (!src_anon_vma) {
  2111. err = -EAGAIN;
  2112. goto unlock_folio;
  2113. }
  2114. anon_vma_lock_write(src_anon_vma);
  2115. } else
  2116. src_anon_vma = NULL;
  2117. dst_ptl = pmd_lockptr(mm, dst_pmd);
  2118. double_pt_lock(src_ptl, dst_ptl);
  2119. if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
  2120. !pmd_same(*dst_pmd, dst_pmdval))) {
  2121. err = -EAGAIN;
  2122. goto unlock_ptls;
  2123. }
  2124. if (src_folio) {
  2125. if (folio_maybe_dma_pinned(src_folio) ||
  2126. !PageAnonExclusive(&src_folio->page)) {
  2127. err = -EBUSY;
  2128. goto unlock_ptls;
  2129. }
  2130. if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
  2131. WARN_ON_ONCE(!folio_test_anon(src_folio))) {
  2132. err = -EBUSY;
  2133. goto unlock_ptls;
  2134. }
  2135. src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
  2136. /* Folio got pinned from under us. Put it back and fail the move. */
  2137. if (folio_maybe_dma_pinned(src_folio)) {
  2138. set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
  2139. err = -EBUSY;
  2140. goto unlock_ptls;
  2141. }
  2142. folio_move_anon_rmap(src_folio, dst_vma);
  2143. src_folio->index = linear_page_index(dst_vma, dst_addr);
  2144. _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
  2145. /* Follow mremap() behavior and treat the entry dirty after the move */
  2146. _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
  2147. } else {
  2148. src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
  2149. _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
  2150. }
  2151. set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
  2152. src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
  2153. pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
  2154. unlock_ptls:
  2155. double_pt_unlock(src_ptl, dst_ptl);
  2156. if (src_anon_vma) {
  2157. anon_vma_unlock_write(src_anon_vma);
  2158. put_anon_vma(src_anon_vma);
  2159. }
  2160. unlock_folio:
  2161. /* unblock rmap walks */
  2162. if (src_folio)
  2163. folio_unlock(src_folio);
  2164. mmu_notifier_invalidate_range_end(&range);
  2165. if (src_folio)
  2166. folio_put(src_folio);
  2167. return err;
  2168. }
  2169. #endif /* CONFIG_USERFAULTFD */
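/*
 * For context, move_pages_huge_pmd() is reached from the UFFDIO_MOVE ioctl.
 * A hedged userspace sketch follows; the uffdio_move field names are given
 * as recalled from the uAPI and should be checked against
 * linux/userfaultfd.h before relying on them:
 *
 *      struct uffdio_move move = {
 *              .src  = (unsigned long)src_addr,
 *              .dst  = (unsigned long)dst_addr,
 *              .len  = len,            // 2MB-aligned to move whole PMDs
 *              .mode = 0,
 *      };
 *      if (ioctl(uffd, UFFDIO_MOVE, &move) && errno != EAGAIN)
 *              err(1, "UFFDIO_MOVE");
 *
 * An -EAGAIN from the kernel side surfaces as a retryable condition for the
 * caller, mirroring the comment before move_pages_huge_pmd().
 */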
/*
 * Returns the page table lock pointer if a given pmd maps a thp, NULL otherwise.
 *
 * Note that if it returns the page table lock pointer, this routine returns
 * without unlocking the page table lock, so callers must unlock it.
 */
  2176. spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
  2177. {
  2178. spinlock_t *ptl;
  2179. ptl = pmd_lock(vma->vm_mm, pmd);
  2180. if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
  2181. pmd_devmap(*pmd)))
  2182. return ptl;
  2183. spin_unlock(ptl);
  2184. return NULL;
  2185. }
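/*
 * Typical caller pattern for the helper above, as used by zap_huge_pmd(),
 * madvise_free_huge_pmd() and change_huge_pmd() earlier in this file:
 *
 *      ptl = __pmd_trans_huge_lock(pmd, vma);
 *      if (!ptl)
 *              return 0;       // not a huge pmd: let the pte path handle it
 *      // ... operate on *pmd while holding ptl ...
 *      spin_unlock(ptl);
 *
 * i.e. a non-NULL return transfers lock ownership to the caller.
 */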
/*
 * Returns the page table lock pointer if a given pud maps a thp, NULL otherwise.
 *
 * Note that if it returns the page table lock pointer, this routine returns
 * without unlocking the page table lock, so callers must unlock it.
 */
  2192. spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
  2193. {
  2194. spinlock_t *ptl;
  2195. ptl = pud_lock(vma->vm_mm, pud);
  2196. if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
  2197. return ptl;
  2198. spin_unlock(ptl);
  2199. return NULL;
  2200. }
  2201. #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
  2202. int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
  2203. pud_t *pud, unsigned long addr)
  2204. {
  2205. spinlock_t *ptl;
  2206. pud_t orig_pud;
  2207. ptl = __pud_trans_huge_lock(pud, vma);
  2208. if (!ptl)
  2209. return 0;
  2210. orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
  2211. arch_check_zapped_pud(vma, orig_pud);
  2212. tlb_remove_pud_tlb_entry(tlb, pud, addr);
  2213. if (vma_is_special_huge(vma)) {
  2214. spin_unlock(ptl);
  2215. /* No zero page support yet */
  2216. } else {
  2217. /* No support for anonymous PUD pages yet */
  2218. BUG();
  2219. }
  2220. return 1;
  2221. }
  2222. static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
  2223. unsigned long haddr)
  2224. {
  2225. VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
  2226. VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
  2227. VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
  2228. VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
  2229. count_vm_event(THP_SPLIT_PUD);
  2230. pudp_huge_clear_flush(vma, haddr, pud);
  2231. }
  2232. void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
  2233. unsigned long address)
  2234. {
  2235. spinlock_t *ptl;
  2236. struct mmu_notifier_range range;
  2237. mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
  2238. address & HPAGE_PUD_MASK,
  2239. (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
  2240. mmu_notifier_invalidate_range_start(&range);
  2241. ptl = pud_lock(vma->vm_mm, pud);
  2242. if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
  2243. goto out;
  2244. __split_huge_pud_locked(vma, pud, range.start);
  2245. out:
  2246. spin_unlock(ptl);
  2247. mmu_notifier_invalidate_range_end(&range);
  2248. }
  2249. #else
  2250. void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
  2251. unsigned long address)
  2252. {
  2253. }
  2254. #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
  2255. static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
  2256. unsigned long haddr, pmd_t *pmd)
  2257. {
  2258. struct mm_struct *mm = vma->vm_mm;
  2259. pgtable_t pgtable;
  2260. pmd_t _pmd, old_pmd;
  2261. unsigned long addr;
  2262. pte_t *pte;
  2263. int i;
        /*
         * Leave the pmd empty until the ptes are filled. Note that it is fine
         * to delay notification until mmu_notifier_invalidate_range_end(), as
         * we are replacing a write-protected zero huge page mapped by a pmd
         * with write-protected zero pages mapped by ptes.
         *
         * See Documentation/mm/mmu_notifier.rst
         */
  2272. old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
  2273. pgtable = pgtable_trans_huge_withdraw(mm, pmd);
  2274. pmd_populate(mm, &_pmd, pgtable);
  2275. pte = pte_offset_map(&_pmd, haddr);
  2276. VM_BUG_ON(!pte);
  2277. for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
  2278. pte_t entry;
  2279. entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
  2280. entry = pte_mkspecial(entry);
  2281. if (pmd_uffd_wp(old_pmd))
  2282. entry = pte_mkuffd_wp(entry);
  2283. VM_BUG_ON(!pte_none(ptep_get(pte)));
  2284. set_pte_at(mm, addr, pte, entry);
  2285. pte++;
  2286. }
  2287. pte_unmap(pte - 1);
  2288. smp_wmb(); /* make pte visible before pmd */
  2289. pmd_populate(mm, pmd, pgtable);
  2290. }
  2291. static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
  2292. unsigned long haddr, bool freeze)
  2293. {
  2294. struct mm_struct *mm = vma->vm_mm;
  2295. struct folio *folio;
  2296. struct page *page;
  2297. pgtable_t pgtable;
  2298. pmd_t old_pmd, _pmd;
  2299. bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
  2300. bool anon_exclusive = false, dirty = false;
  2301. unsigned long addr;
  2302. pte_t *pte;
  2303. int i;
  2304. VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
  2305. VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
  2306. VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
  2307. VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
  2308. && !pmd_devmap(*pmd));
  2309. count_vm_event(THP_SPLIT_PMD);
  2310. if (!vma_is_anonymous(vma)) {
  2311. old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
  2312. /*
  2313. * We are going to unmap this huge page. So
  2314. * just go ahead and zap it
  2315. */
  2316. if (arch_needs_pgtable_deposit())
  2317. zap_deposited_table(mm, pmd);
  2318. if (vma_is_special_huge(vma))
  2319. return;
  2320. if (unlikely(is_pmd_migration_entry(old_pmd))) {
  2321. swp_entry_t entry;
  2322. entry = pmd_to_swp_entry(old_pmd);
  2323. folio = pfn_swap_entry_folio(entry);
  2324. } else {
  2325. page = pmd_page(old_pmd);
  2326. folio = page_folio(page);
  2327. if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
  2328. folio_mark_dirty(folio);
  2329. if (!folio_test_referenced(folio) && pmd_young(old_pmd))
  2330. folio_set_referenced(folio);
  2331. folio_remove_rmap_pmd(folio, page, vma);
  2332. folio_put(folio);
  2333. }
  2334. add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
  2335. return;
  2336. }
  2337. if (is_huge_zero_pmd(*pmd)) {
                /*
                 * FIXME: Do we want to invalidate the secondary mmu by calling
                 * mmu_notifier_arch_invalidate_secondary_tlbs()? See the
                 * comments below inside __split_huge_pmd().
                 *
                 * We are going from a write-protected huge zero page to
                 * write-protected small zero pages, so it does not seem
                 * useful to invalidate the secondary mmu at this time.
                 */
  2347. return __split_huge_zero_page_pmd(vma, haddr, pmd);
  2348. }
  2349. pmd_migration = is_pmd_migration_entry(*pmd);
  2350. if (unlikely(pmd_migration)) {
  2351. swp_entry_t entry;
  2352. old_pmd = *pmd;
  2353. entry = pmd_to_swp_entry(old_pmd);
  2354. page = pfn_swap_entry_to_page(entry);
  2355. write = is_writable_migration_entry(entry);
  2356. if (PageAnon(page))
  2357. anon_exclusive = is_readable_exclusive_migration_entry(entry);
  2358. young = is_migration_entry_young(entry);
  2359. dirty = is_migration_entry_dirty(entry);
  2360. soft_dirty = pmd_swp_soft_dirty(old_pmd);
  2361. uffd_wp = pmd_swp_uffd_wp(old_pmd);
  2362. } else {
                /*
                 * Up to this point the pmd is present and huge and userland has
                 * full access to the hugepage during the split (which
                 * happens in place). If we overwrite the pmd with the not-huge
                 * version pointing to the pte here (which of course we could if
                 * all CPUs were bug free), userland could trigger a small page
                 * size TLB miss on the small sized TLB while the hugepage TLB
                 * entry is still established in the huge TLB. Some CPUs don't
                 * like that. See
                 * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
                 * 383 on page 105. Intel should be safe, but it also warns that
                 * it's only safe if the permission and cache attributes of the
                 * two entries loaded into the two TLBs are identical (which should
                 * be the case here). But it is generally safer to never allow
                 * small and huge TLB entries for the same virtual address to be
                 * loaded simultaneously. So instead of doing "pmd_populate();
                 * flush_pmd_tlb_range();" we first mark the current pmd
                 * not-present (atomically, because here the pmd_trans_huge must
                 * remain set at all times on the pmd until the split is
                 * complete for this pmd), then we flush the SMP TLB and finally
                 * we write the non-huge version of the pmd entry with
                 * pmd_populate.
                 */
  2386. old_pmd = pmdp_invalidate(vma, haddr, pmd);
  2387. page = pmd_page(old_pmd);
  2388. folio = page_folio(page);
  2389. if (pmd_dirty(old_pmd)) {
  2390. dirty = true;
  2391. folio_set_dirty(folio);
  2392. }
  2393. write = pmd_write(old_pmd);
  2394. young = pmd_young(old_pmd);
  2395. soft_dirty = pmd_soft_dirty(old_pmd);
  2396. uffd_wp = pmd_uffd_wp(old_pmd);
  2397. VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio);
  2398. VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
  2399. /*
  2400. * Without "freeze", we'll simply split the PMD, propagating the
  2401. * PageAnonExclusive() flag for each PTE by setting it for
  2402. * each subpage -- no need to (temporarily) clear.
  2403. *
  2404. * With "freeze" we want to replace mapped pages by
  2405. * migration entries right away. This is only possible if we
  2406. * managed to clear PageAnonExclusive() -- see
  2407. * set_pmd_migration_entry().
  2408. *
  2409. * In case we cannot clear PageAnonExclusive(), split the PMD
  2410. * only and let try_to_migrate_one() fail later.
  2411. *
  2412. * See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
  2413. */
  2414. anon_exclusive = PageAnonExclusive(page);
  2415. if (freeze && anon_exclusive &&
  2416. folio_try_share_anon_rmap_pmd(folio, page))
  2417. freeze = false;
  2418. if (!freeze) {
  2419. rmap_t rmap_flags = RMAP_NONE;
  2420. folio_ref_add(folio, HPAGE_PMD_NR - 1);
  2421. if (anon_exclusive)
  2422. rmap_flags |= RMAP_EXCLUSIVE;
  2423. folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
  2424. vma, haddr, rmap_flags);
  2425. }
  2426. }
        /*
         * Withdraw the table only after we mark the pmd entry invalid.
         * This is critical for some architectures (Power).
         */
  2431. pgtable = pgtable_trans_huge_withdraw(mm, pmd);
  2432. pmd_populate(mm, &_pmd, pgtable);
  2433. pte = pte_offset_map(&_pmd, haddr);
  2434. VM_BUG_ON(!pte);
  2435. /*
  2436. * Note that NUMA hinting access restrictions are not transferred to
  2437. * avoid any possibility of altering permissions across VMAs.
  2438. */
  2439. if (freeze || pmd_migration) {
  2440. for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
  2441. pte_t entry;
  2442. swp_entry_t swp_entry;
  2443. if (write)
  2444. swp_entry = make_writable_migration_entry(
  2445. page_to_pfn(page + i));
  2446. else if (anon_exclusive)
  2447. swp_entry = make_readable_exclusive_migration_entry(
  2448. page_to_pfn(page + i));
  2449. else
  2450. swp_entry = make_readable_migration_entry(
  2451. page_to_pfn(page + i));
  2452. if (young)
  2453. swp_entry = make_migration_entry_young(swp_entry);
  2454. if (dirty)
  2455. swp_entry = make_migration_entry_dirty(swp_entry);
  2456. entry = swp_entry_to_pte(swp_entry);
  2457. if (soft_dirty)
  2458. entry = pte_swp_mksoft_dirty(entry);
  2459. if (uffd_wp)
  2460. entry = pte_swp_mkuffd_wp(entry);
  2461. VM_WARN_ON(!pte_none(ptep_get(pte + i)));
  2462. set_pte_at(mm, addr, pte + i, entry);
  2463. }
  2464. } else {
  2465. pte_t entry;
  2466. entry = mk_pte(page, READ_ONCE(vma->vm_page_prot));
  2467. if (write)
  2468. entry = pte_mkwrite(entry, vma);
  2469. if (!young)
  2470. entry = pte_mkold(entry);
  2471. /* NOTE: this may set soft-dirty too on some archs */
  2472. if (dirty)
  2473. entry = pte_mkdirty(entry);
  2474. if (soft_dirty)
  2475. entry = pte_mksoft_dirty(entry);
  2476. if (uffd_wp)
  2477. entry = pte_mkuffd_wp(entry);
  2478. for (i = 0; i < HPAGE_PMD_NR; i++)
  2479. VM_WARN_ON(!pte_none(ptep_get(pte + i)));
  2480. set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
  2481. }
  2482. pte_unmap(pte);
  2483. if (!pmd_migration)
  2484. folio_remove_rmap_pmd(folio, page, vma);
  2485. if (freeze)
  2486. put_page(page);
  2487. smp_wmb(); /* make pte visible before pmd */
  2488. pmd_populate(mm, pmd, pgtable);
  2489. }
  2490. void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
  2491. pmd_t *pmd, bool freeze, struct folio *folio)
  2492. {
  2493. VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio));
  2494. VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
  2495. VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
  2496. VM_BUG_ON(freeze && !folio);
  2497. /*
  2498. * When the caller requests to set up a migration entry, we
  2499. * require a folio to check the PMD against. Otherwise, there
  2500. * is a risk of replacing the wrong folio.
  2501. */
  2502. if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
  2503. is_pmd_migration_entry(*pmd)) {
  2504. if (folio && folio != pmd_folio(*pmd))
  2505. return;
  2506. __split_huge_pmd_locked(vma, pmd, address, freeze);
  2507. }
  2508. }
  2509. void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
  2510. unsigned long address, bool freeze, struct folio *folio)
  2511. {
  2512. spinlock_t *ptl;
  2513. struct mmu_notifier_range range;
  2514. mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
  2515. address & HPAGE_PMD_MASK,
  2516. (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
  2517. mmu_notifier_invalidate_range_start(&range);
  2518. ptl = pmd_lock(vma->vm_mm, pmd);
  2519. split_huge_pmd_locked(vma, range.start, pmd, freeze, folio);
  2520. spin_unlock(ptl);
  2521. mmu_notifier_invalidate_range_end(&range);
  2522. }
  2523. void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
  2524. bool freeze, struct folio *folio)
  2525. {
  2526. pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
  2527. if (!pmd)
  2528. return;
  2529. __split_huge_pmd(vma, pmd, address, freeze, folio);
  2530. }
  2531. static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
  2532. {
        /*
         * If the new address isn't hpage aligned and it could previously
         * contain a hugepage: check if we need to split a huge pmd.
         */
  2537. if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
  2538. range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
  2539. ALIGN(address, HPAGE_PMD_SIZE)))
  2540. split_huge_pmd_address(vma, address, false, NULL);
  2541. }
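/*
 * Worked example for the alignment check above (illustrative numbers): with
 * 2MB huge pages, address 0x120a000 is not HPAGE_PMD_SIZE-aligned, so the
 * range checked is [ALIGN_DOWN = 0x1200000, ALIGN = 0x1400000). If that whole
 * 2MB window lies inside the VMA, a THP may straddle the new boundary and the
 * pmd gets split; an already 2MB-aligned address needs no split.
 */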
  2542. void vma_adjust_trans_huge(struct vm_area_struct *vma,
  2543. unsigned long start,
  2544. unsigned long end,
  2545. long adjust_next)
  2546. {
  2547. /* Check if we need to split start first. */
  2548. split_huge_pmd_if_needed(vma, start);
  2549. /* Check if we need to split end next. */
  2550. split_huge_pmd_if_needed(vma, end);
  2551. /*
  2552. * If we're also updating the next vma vm_start,
  2553. * check if we need to split it.
  2554. */
  2555. if (adjust_next > 0) {
  2556. struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
  2557. unsigned long nstart = next->vm_start;
  2558. nstart += adjust_next;
  2559. split_huge_pmd_if_needed(next, nstart);
  2560. }
  2561. }
  2562. static void unmap_folio(struct folio *folio)
  2563. {
  2564. enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC |
  2565. TTU_BATCH_FLUSH;
  2566. VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
  2567. if (folio_test_pmd_mappable(folio))
  2568. ttu_flags |= TTU_SPLIT_HUGE_PMD;
  2569. /*
  2570. * Anon pages need migration entries to preserve them, but file
  2571. * pages can simply be left unmapped, then faulted back on demand.
  2572. * If that is ever changed (perhaps for mlock), update remap_page().
  2573. */
  2574. if (folio_test_anon(folio))
  2575. try_to_migrate(folio, ttu_flags);
  2576. else
  2577. try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
  2578. try_to_unmap_flush();
  2579. }
static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
					    unsigned long addr, pmd_t *pmdp,
					    struct folio *folio)
{
	struct mm_struct *mm = vma->vm_mm;
	int ref_count, map_count;
	pmd_t orig_pmd = *pmdp;

	if (folio_test_dirty(folio) || pmd_dirty(orig_pmd))
		return false;

	orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);

	/*
	 * Syncing against concurrent GUP-fast:
	 * - clear PMD; barrier; read refcount
	 * - inc refcount; barrier; read PMD
	 */
	smp_mb();
	ref_count = folio_ref_count(folio);
	map_count = folio_mapcount(folio);

	/*
	 * Order reads for folio refcount and dirty flag
	 * (see comments in __remove_mapping()).
	 */
	smp_rmb();

	/*
	 * If the folio or its PMD is redirtied at this point, or if there
	 * are unexpected references, we give up on discarding the folio
	 * and remap it instead.
	 *
	 * The only folio refs must be one from isolation plus the rmap(s).
	 */
	if (folio_test_dirty(folio) || pmd_dirty(orig_pmd) ||
	    ref_count != map_count + 1) {
		set_pmd_at(mm, addr, pmdp, orig_pmd);
		return false;
	}

	folio_remove_rmap_pmd(folio, pmd_page(orig_pmd), vma);
	zap_deposited_table(mm, pmdp);
	add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
	if (vma->vm_flags & VM_LOCKED)
		mlock_drain_local();
	folio_put(folio);

	return true;
}

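/*
 * Called with the PMD lock and folio lock held on a PMD-aligned @addr. Only
 * clean anonymous folios that are not swap-backed (i.e. lazyfreed via
 * MADV_FREE) are eligible for direct discard; everything else returns false,
 * leaving the caller to deal with the PMD another way (typically by
 * splitting it).
 */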
bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
			   pmd_t *pmdp, struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));

	if (folio_test_anon(folio) && !folio_test_swapbacked(folio))
		return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);

	return false;
}

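/*
 * Restore the migration entries installed by unmap_folio(). After a split,
 * @folio is the first of possibly many smaller folios covering @nr pages,
 * so walk them with folio_next() and remove the migration PTEs of each.
 */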
static void remap_page(struct folio *folio, unsigned long nr, int flags)
{
	int i = 0;

	/* If unmap_folio() uses try_to_migrate() on file, remove this check */
	if (!folio_test_anon(folio))
		return;

	for (;;) {
		remove_migration_ptes(folio, folio, RMP_LOCKED | flags);
		i += folio_nr_pages(folio);
		if (i >= nr)
			break;
		folio = folio_next(folio);
	}
}

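/*
 * Put a freshly-split tail page on the right list: onto the caller's private
 * @list when page reclaim is splitting the folio, otherwise onto the LRU
 * alongside the still-frozen head folio.
 */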
static void lru_add_page_tail(struct folio *folio, struct page *tail,
		struct lruvec *lruvec, struct list_head *list)
{
	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
	VM_BUG_ON_FOLIO(PageLRU(tail), folio);
	lockdep_assert_held(&lruvec->lru_lock);

	if (list) {
		/* page reclaim is reclaiming a huge page */
		VM_WARN_ON(folio_test_lru(folio));
		get_page(tail);
		list_add_tail(&tail->lru, list);
	} else {
		/* head is still on lru (and we have it frozen) */
		VM_WARN_ON(!folio_test_lru(folio));
		if (folio_test_unevictable(folio))
			tail->mlock_count = 0;
		else
			list_add_tail(&tail->lru, &folio->lru);
		SetPageLRU(tail);
	}
}

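/*
 * Turn the tail page at index @tail of @folio into the head of a new folio
 * of order @new_order: clone the relevant page flags from the head, fix up
 * ->mapping/->index, clear PageTail, and unfreeze the refcount before adding
 * the new folio to the LRU.
 */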
static void __split_huge_page_tail(struct folio *folio, int tail,
		struct lruvec *lruvec, struct list_head *list,
		unsigned int new_order)
{
	struct page *head = &folio->page;
	struct page *page_tail = head + tail;
	/*
	 * Careful: new_folio is not a "real" folio before we cleared PageTail.
	 * Don't pass it around before clear_compound_head().
	 */
	struct folio *new_folio = (struct folio *)page_tail;

	VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);

	/*
	 * Clone page flags before unfreezing refcount.
	 *
	 * A successful get_page_unless_zero() might be followed by flag
	 * changes, for example lock_page(), which sets PG_waiters.
	 *
	 * Note that for mapped sub-pages of an anonymous THP,
	 * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
	 * the migration entry instead, from where remap_page() will restore it.
	 * We can still have PG_anon_exclusive set on effectively unmapped and
	 * unreferenced sub-pages of an anonymous THP: we can simply drop
	 * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
	 */
	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
	page_tail->flags |= (head->flags &
			((1L << PG_referenced) |
			 (1L << PG_swapbacked) |
			 (1L << PG_swapcache) |
			 (1L << PG_mlocked) |
			 (1L << PG_uptodate) |
			 (1L << PG_active) |
			 (1L << PG_workingset) |
			 (1L << PG_locked) |
			 (1L << PG_unevictable) |
#ifdef CONFIG_ARCH_USES_PG_ARCH_2
			 (1L << PG_arch_2) |
#endif
#ifdef CONFIG_ARCH_USES_PG_ARCH_3
			 (1L << PG_arch_3) |
#endif
			 (1L << PG_dirty) |
			 LRU_GEN_MASK | LRU_REFS_MASK));

	/* ->mapping in first and second tail page is replaced by other uses */
	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
			page_tail);
	page_tail->mapping = head->mapping;
	page_tail->index = head->index + tail;

	/*
	 * page->private should not be set in tail pages. Fix up and warn once
	 * if private is unexpectedly set.
	 */
	if (unlikely(page_tail->private)) {
		VM_WARN_ON_ONCE_PAGE(true, page_tail);
		page_tail->private = 0;
	}
	if (folio_test_swapcache(folio))
		new_folio->swap.val = folio->swap.val + tail;

	/* Page flags must be visible before we make the page non-compound. */
	smp_wmb();

	/*
	 * Clear PageTail before unfreezing page refcount.
	 *
	 * A successful get_page_unless_zero() might be followed by put_page(),
	 * which needs a correct compound_head().
	 */
	clear_compound_head(page_tail);
	if (new_order) {
		prep_compound_page(page_tail, new_order);
		folio_set_large_rmappable(new_folio);
	}

	/* Finally unfreeze refcount. Additional reference from page cache. */
	page_ref_unfreeze(page_tail,
		1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ?
			     folio_nr_pages(new_folio) : 0));

	if (folio_test_young(folio))
		folio_set_young(new_folio);
	if (folio_test_idle(folio))
		folio_set_idle(new_folio);

	folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));

	/*
	 * always add to the tail because some iterators expect new
	 * pages to show after the currently processed elements - e.g.
	 * migrate_pages
	 */
	lru_add_page_tail(folio, page_tail, lruvec, list);
}

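/*
 * Do the actual split of a refcount-frozen large folio into folios of order
 * @new_order: distribute memcg/page_owner state, create each new tail folio,
 * drop tails beyond @end (EOF) for file/shmem mappings, update the page
 * cache or swap cache slots, and finally unfreeze, remap and unlock the
 * resulting folios.
 */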
static void __split_huge_page(struct page *page, struct list_head *list,
		pgoff_t end, unsigned int new_order)
{
	struct folio *folio = page_folio(page);
	struct page *head = &folio->page;
	struct lruvec *lruvec;
	struct address_space *swap_cache = NULL;
	unsigned long offset = 0;
	int i, nr_dropped = 0;
	unsigned int new_nr = 1 << new_order;
	int order = folio_order(folio);
	unsigned int nr = 1 << order;

	/* complete memcg works before add pages to LRU */
	split_page_memcg(head, order, new_order);

	if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
		offset = swap_cache_index(folio->swap);
		swap_cache = swap_address_space(folio->swap);
		xa_lock(&swap_cache->i_pages);
	}

	/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
	lruvec = folio_lruvec_lock(folio);

	ClearPageHasHWPoisoned(head);

	for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
		__split_huge_page_tail(folio, i, lruvec, list, new_order);
		/* Some pages can be beyond EOF: drop them from page cache */
		if (head[i].index >= end) {
			struct folio *tail = page_folio(head + i);

			if (shmem_mapping(folio->mapping))
				nr_dropped++;
			else if (folio_test_clear_dirty(tail))
				folio_account_cleaned(tail,
					inode_to_wb(folio->mapping->host));
			__filemap_remove_folio(tail, NULL);
			folio_put(tail);
		} else if (!PageAnon(page)) {
			__xa_store(&folio->mapping->i_pages, head[i].index,
					head + i, 0);
		} else if (swap_cache) {
			__xa_store(&swap_cache->i_pages, offset + i,
					head + i, 0);
		}
	}

	if (!new_order)
		ClearPageCompound(head);
	else {
		struct folio *new_folio = (struct folio *)head;

		folio_set_order(new_folio, new_order);
	}
	unlock_page_lruvec(lruvec);
	/* Caller disabled irqs, so they are still disabled here */

	split_page_owner(head, order, new_order);
	pgalloc_tag_split(folio, order, new_order);

	/* See comment in __split_huge_page_tail() */
	if (folio_test_anon(folio)) {
		/* Additional pin to swap cache */
		if (folio_test_swapcache(folio)) {
			folio_ref_add(folio, 1 + new_nr);
			xa_unlock(&swap_cache->i_pages);
		} else {
			folio_ref_inc(folio);
		}
	} else {
		/* Additional pin to page cache */
		folio_ref_add(folio, 1 + new_nr);
		xa_unlock(&folio->mapping->i_pages);
	}
	local_irq_enable();

	if (nr_dropped)
		shmem_uncharge(folio->mapping->host, nr_dropped);
	remap_page(folio, nr, PageAnon(head) ? RMP_USE_SHARED_ZEROPAGE : 0);

	/*
	 * set page to its compound_head when split to non order-0 pages, so
	 * we can skip unlocking it below, since PG_locked is transferred to
	 * the compound_head of the page and the caller will unlock it.
	 */
	if (new_order)
		page = compound_head(page);

	for (i = 0; i < nr; i += new_nr) {
		struct page *subpage = head + i;
		struct folio *new_folio = page_folio(subpage);

		if (subpage == page)
			continue;
		folio_unlock(new_folio);

		/*
		 * Subpages may be freed if there wasn't any mapping
		 * like if add_to_swap() is running on a lru page that
		 * had its mapping zapped. And freeing these pages
		 * requires taking the lru_lock so we do the put_page
		 * of the tail pages after the split is complete.
		 */
		free_page_and_swap_cache(subpage);
	}
}

/* Racy check whether the huge page can be split */
bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
{
	int extra_pins;

	/* Additional pins from page cache */
	if (folio_test_anon(folio))
		extra_pins = folio_test_swapcache(folio) ?
				folio_nr_pages(folio) : 0;
	else
		extra_pins = folio_nr_pages(folio);
	if (pextra_pins)
		*pextra_pins = extra_pins;
	return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
					caller_pins;
}

/*
 * This function splits a large folio into smaller folios of order @new_order.
 * @page can point to any page of the large folio to split. The split operation
 * does not change the position of @page.
 *
 * Prerequisites:
 *
 * 1) The caller must hold a reference on the @page's owning folio, also known
 *    as the large folio.
 *
 * 2) The large folio must be locked.
 *
 * 3) The folio must not be pinned. Any unexpected folio references, including
 *    GUP pins, will result in the folio not getting split; instead, the caller
 *    will receive an -EAGAIN.
 *
 * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not
 *    supported, because folio->_deferred_list, which is used by partially
 *    mapped folios, is stored in subpage 2, but an order-1 folio only has
 *    subpages 0 and 1. File-backed order-1 folios are supported, since they
 *    do not use _deferred_list.
 *
 * After splitting, the caller's folio reference will be transferred to @page,
 * resulting in a raised refcount of @page after this call. The other pages may
 * be freed if they are not mapped.
 *
 * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
 *
 * Pages in @new_order will inherit the mapping, flags, and so on from the
 * huge page.
 *
 * Returns 0 if the huge page was split successfully.
 *
 * Returns -EAGAIN if the folio has an unexpected reference (e.g., GUP) or if
 * the folio was concurrently removed from the page cache.
 *
 * Returns -EBUSY when trying to split the huge zeropage, if the folio is
 * under writeback, if fs-specific folio metadata cannot currently be
 * released, or if some unexpected race happened (e.g., anon VMA disappeared,
 * truncation).
 *
 * Callers should ensure that the order respects the address space mapping
 * min-order if one is set for non-anonymous folios.
 *
 * Returns -EINVAL when trying to split to an order that is incompatible
 * with the folio. Splitting to order 0 is compatible with all folios.
 */
int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
				     unsigned int new_order)
{
	struct folio *folio = page_folio(page);
	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
	/* reset xarray order to new order after split */
	XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
	bool is_anon = folio_test_anon(folio);
	struct address_space *mapping = NULL;
	struct anon_vma *anon_vma = NULL;
	int order = folio_order(folio);
	int extra_pins, ret;
	pgoff_t end;
	bool is_hzp;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

	if (new_order >= folio_order(folio))
		return -EINVAL;

	if (is_anon) {
		/* order-1 is not supported for anonymous THP. */
		if (new_order == 1) {
			VM_WARN_ONCE(1, "Cannot split to order-1 folio");
			return -EINVAL;
		}
	} else if (new_order) {
		/* Split shmem folio to non-zero order not supported */
		if (shmem_mapping(folio->mapping)) {
			VM_WARN_ONCE(1,
				"Cannot split shmem folio to non-0 order");
			return -EINVAL;
		}
		/*
		 * No split if the file system does not support large folios.
		 * Note that we might still have THPs in such mappings due to
		 * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
		 * does not actually support large folios properly.
		 */
		if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
		    !mapping_large_folio_support(folio->mapping)) {
			VM_WARN_ONCE(1,
				"Cannot split file folio to non-0 order");
			return -EINVAL;
		}
	}

	/* Only swapping a whole PMD-mapped folio is supported */
	if (folio_test_swapcache(folio) && new_order)
		return -EINVAL;

	is_hzp = is_huge_zero_folio(folio);
	if (is_hzp) {
		pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
		return -EBUSY;
	}

	if (folio_test_writeback(folio))
		return -EBUSY;

	if (is_anon) {
		/*
		 * The caller does not necessarily hold an mmap_lock that would
		 * prevent the anon_vma disappearing, so we first take a
		 * reference to it and then lock the anon_vma for write. This
		 * is similar to folio_lock_anon_vma_read except the write lock
		 * is taken to serialise against parallel split or collapse
		 * operations.
		 */
		anon_vma = folio_get_anon_vma(folio);
		if (!anon_vma) {
			ret = -EBUSY;
			goto out;
		}
		end = -1;
		mapping = NULL;
		anon_vma_lock_write(anon_vma);
	} else {
		unsigned int min_order;
		gfp_t gfp;

		mapping = folio->mapping;

		/* Truncated ? */
		if (!mapping) {
			ret = -EBUSY;
			goto out;
		}

		min_order = mapping_min_folio_order(folio->mapping);
		if (new_order < min_order) {
			VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u",
				     min_order);
			ret = -EINVAL;
			goto out;
		}

		gfp = current_gfp_context(mapping_gfp_mask(mapping) &
							GFP_RECLAIM_MASK);

		if (!filemap_release_folio(folio, gfp)) {
			ret = -EBUSY;
			goto out;
		}

		xas_split_alloc(&xas, folio, folio_order(folio), gfp);
		if (xas_error(&xas)) {
			ret = xas_error(&xas);
			goto out;
		}

		anon_vma = NULL;
		i_mmap_lock_read(mapping);

		/*
		 * __split_huge_page() may need to trim off pages beyond EOF:
		 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
		 * which cannot be nested inside the page tree lock. So note
		 * end now: i_size itself may be changed at any moment, but
		 * folio lock is good enough to serialize the trimming.
		 */
		end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
		if (shmem_mapping(mapping))
			end = shmem_fallocend(mapping->host, end);
	}

	/*
	 * Racy check if we can split the page, before unmap_folio() will
	 * split PMDs
	 */
	if (!can_split_folio(folio, 1, &extra_pins)) {
		ret = -EAGAIN;
		goto out_unlock;
	}

	unmap_folio(folio);

	/* block interrupt reentry in xa_lock and spinlock */
	local_irq_disable();
	if (mapping) {
		/*
		 * Check if the folio is present in page cache.
		 * We assume all tail are present too, if folio is there.
		 */
		xas_lock(&xas);
		xas_reset(&xas);
		if (xas_load(&xas) != folio)
			goto fail;
	}

	/* Prevent deferred_split_scan() touching ->_refcount */
	spin_lock(&ds_queue->split_queue_lock);
	if (folio_ref_freeze(folio, 1 + extra_pins)) {
		if (folio_order(folio) > 1 &&
		    !list_empty(&folio->_deferred_list)) {
			ds_queue->split_queue_len--;
			if (folio_test_partially_mapped(folio)) {
				folio_clear_partially_mapped(folio);
				mod_mthp_stat(folio_order(folio),
					MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
			}
			/*
			 * Reinitialize page_deferred_list after removing the
			 * page from the split_queue, otherwise a subsequent
			 * split will see list corruption when checking the
			 * page_deferred_list.
			 */
			list_del_init(&folio->_deferred_list);
		}
		spin_unlock(&ds_queue->split_queue_lock);
		if (mapping) {
			int nr = folio_nr_pages(folio);

			xas_split(&xas, folio, folio_order(folio));
			if (folio_test_pmd_mappable(folio) &&
			    new_order < HPAGE_PMD_ORDER) {
				if (folio_test_swapbacked(folio)) {
					__lruvec_stat_mod_folio(folio,
							NR_SHMEM_THPS, -nr);
				} else {
					__lruvec_stat_mod_folio(folio,
							NR_FILE_THPS, -nr);
					filemap_nr_thps_dec(mapping);
				}
			}
		}

		if (is_anon) {
			mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
			mod_mthp_stat(new_order, MTHP_STAT_NR_ANON, 1 << (order - new_order));
		}
		__split_huge_page(page, list, end, new_order);
		ret = 0;
	} else {
		spin_unlock(&ds_queue->split_queue_lock);
fail:
		if (mapping)
			xas_unlock(&xas);
		local_irq_enable();
		remap_page(folio, folio_nr_pages(folio), 0);
		ret = -EAGAIN;
	}

out_unlock:
	if (anon_vma) {
		anon_vma_unlock_write(anon_vma);
		put_anon_vma(anon_vma);
	}
	if (mapping)
		i_mmap_unlock_read(mapping);
out:
	xas_destroy(&xas);
	if (order == HPAGE_PMD_ORDER)
		count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
	count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED);
	return ret;
}

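/*
 * Return the smallest order a folio may be split to: 0 for anonymous folios,
 * the mapping's minimum folio order for file-backed ones, or -EBUSY if the
 * folio has been truncated and no longer has a mapping to consult.
 */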
int min_order_for_split(struct folio *folio)
{
	if (folio_test_anon(folio))
		return 0;

	if (!folio->mapping) {
		if (folio_test_pmd_mappable(folio))
			count_vm_event(THP_SPLIT_PAGE_FAILED);
		return -EBUSY;
	}

	return mapping_min_folio_order(folio->mapping);
}

int split_folio_to_list(struct folio *folio, struct list_head *list)
{
	int ret = min_order_for_split(folio);

	if (ret < 0)
		return ret;

	return split_huge_page_to_list_to_order(&folio->page, list, ret);
}

/*
 * __folio_unqueue_deferred_split() is not to be called directly:
 * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h
 * limits its calls to those folios which may have a _deferred_list for
 * queueing THP splits, and that list is (racily observed to be) non-empty.
 *
 * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
 * zero: because even when split_queue_lock is held, a non-empty _deferred_list
 * might be in use on deferred_split_scan()'s unlocked on-stack list.
 *
 * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
 * therefore important to unqueue deferred split before changing folio memcg.
 */
bool __folio_unqueue_deferred_split(struct folio *folio)
{
	struct deferred_split *ds_queue;
	unsigned long flags;
	bool unqueued = false;

	WARN_ON_ONCE(folio_ref_count(folio));
	WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio));

	ds_queue = get_deferred_split_queue(folio);
	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	if (!list_empty(&folio->_deferred_list)) {
		ds_queue->split_queue_len--;
		if (folio_test_partially_mapped(folio)) {
			folio_clear_partially_mapped(folio);
			mod_mthp_stat(folio_order(folio),
				      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
		}
		list_del_init(&folio->_deferred_list);
		unqueued = true;
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

	return unqueued;	/* useful for debug warnings */
}

/* partially_mapped=false won't clear PG_partially_mapped folio flag */
void deferred_split_folio(struct folio *folio, bool partially_mapped)
{
	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
#ifdef CONFIG_MEMCG
	struct mem_cgroup *memcg = folio_memcg(folio);
#endif
	unsigned long flags;

	/*
	 * Order 1 folios have no space for a deferred list, but we also
	 * won't waste much memory by not adding them to the deferred list.
	 */
	if (folio_order(folio) <= 1)
		return;

	if (!partially_mapped && !split_underused_thp)
		return;

	/*
	 * Exclude swapcache: originally to avoid a corrupt deferred split
	 * queue. Nowadays that is fully prevented by mem_cgroup_swapout();
	 * but if page reclaim is already handling the same folio, it is
	 * unnecessary to handle it again in the shrinker, so excluding
	 * swapcache here may still be a useful optimization.
	 */
	if (folio_test_swapcache(folio))
		return;

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	if (partially_mapped) {
		if (!folio_test_partially_mapped(folio)) {
			folio_set_partially_mapped(folio);
			if (folio_test_pmd_mappable(folio))
				count_vm_event(THP_DEFERRED_SPLIT_PAGE);
			count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
			mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1);
		}
	} else {
		/* partially mapped folios cannot become non-partially mapped */
		VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
	}
	if (list_empty(&folio->_deferred_list)) {
		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
		ds_queue->split_queue_len++;
#ifdef CONFIG_MEMCG
		if (memcg)
			set_shrinker_bit(memcg, folio_nid(folio),
					 deferred_split_shrinker->id);
#endif
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}

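/*
 * Shrinker ->count_objects callback: report how many folios are currently
 * queued for deferred splitting on this node (or in this memcg when
 * CONFIG_MEMCG is enabled).
 */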
static unsigned long deferred_split_count(struct shrinker *shrink,
		struct shrink_control *sc)
{
	struct pglist_data *pgdata = NODE_DATA(sc->nid);
	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;

#ifdef CONFIG_MEMCG
	if (sc->memcg)
		ds_queue = &sc->memcg->deferred_split_queue;
#endif
	return READ_ONCE(ds_queue->split_queue_len);
}

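/*
 * Decide whether a THP is "underused", i.e. contains more zero-filled
 * subpages than khugepaged_max_ptes_none allows. Such folios are worth
 * splitting so the zero-filled portions can be freed. Bails out early once
 * either the zero-page or the non-zero-page threshold is crossed.
 */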
static bool thp_underused(struct folio *folio)
{
	int num_zero_pages = 0, num_filled_pages = 0;
	void *kaddr;
	int i;

	if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
		return false;

	for (i = 0; i < folio_nr_pages(folio); i++) {
		kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
		if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
			num_zero_pages++;
			if (num_zero_pages > khugepaged_max_ptes_none) {
				kunmap_local(kaddr);
				return true;
			}
		} else {
			/*
			 * Another path for early exit once the number
			 * of non-zero filled pages exceeds threshold.
			 */
			num_filled_pages++;
			if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) {
				kunmap_local(kaddr);
				return false;
			}
		}
		kunmap_local(kaddr);
	}
	return false;
}

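/*
 * Shrinker ->scan_objects callback: pull folios off the deferred split
 * queue (taking a reference on each so they cannot be freed under us),
 * try to split those that are partially mapped or underused, and requeue
 * the partially-mapped ones that could not be split.
 */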
static unsigned long deferred_split_scan(struct shrinker *shrink,
		struct shrink_control *sc)
{
	struct pglist_data *pgdata = NODE_DATA(sc->nid);
	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
	unsigned long flags;
	LIST_HEAD(list);
	struct folio *folio, *next, *prev = NULL;
	int split = 0, removed = 0;

#ifdef CONFIG_MEMCG
	if (sc->memcg)
		ds_queue = &sc->memcg->deferred_split_queue;
#endif

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	/* Take pin on all head pages to avoid freeing them under us */
	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
							_deferred_list) {
		if (folio_try_get(folio)) {
			list_move(&folio->_deferred_list, &list);
		} else {
			/* We lost race with folio_put() */
			if (folio_test_partially_mapped(folio)) {
				folio_clear_partially_mapped(folio);
				mod_mthp_stat(folio_order(folio),
					      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
			}
			list_del_init(&folio->_deferred_list);
			ds_queue->split_queue_len--;
		}
		if (!--sc->nr_to_scan)
			break;
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

	list_for_each_entry_safe(folio, next, &list, _deferred_list) {
		bool did_split = false;
		bool underused = false;

		if (!folio_test_partially_mapped(folio)) {
			underused = thp_underused(folio);
			if (!underused)
				goto next;
		}
		if (!folio_trylock(folio))
			goto next;
		if (!split_folio(folio)) {
			did_split = true;
			if (underused)
				count_vm_event(THP_UNDERUSED_SPLIT_PAGE);
			split++;
		}
		folio_unlock(folio);
next:
		/*
		 * split_folio() removes folio from list on success.
		 * Only add back to the queue if folio is partially mapped.
		 * If thp_underused returns false, or if split_folio fails
		 * in the case it was underused, then consider it used and
		 * don't add it back to split_queue.
		 */
		if (did_split) {
			; /* folio already removed from list */
		} else if (!folio_test_partially_mapped(folio)) {
			list_del_init(&folio->_deferred_list);
			removed++;
		} else {
			/*
			 * That unlocked list_del_init() above would be unsafe,
			 * unless its folio is separated from any earlier folios
			 * left on the list (which may be concurrently unqueued)
			 * by one safe folio with refcount still raised.
			 */
			swap(folio, prev);
		}
		if (folio)
			folio_put(folio);
	}

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	list_splice_tail(&list, &ds_queue->split_queue);
	ds_queue->split_queue_len -= removed;
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

	if (prev)
		folio_put(prev);

	/*
	 * Stop shrinker if we didn't split any page, but the queue is empty.
	 * This can happen if pages were freed under us.
	 */
	if (!split && list_empty(&ds_queue->split_queue))
		return SHRINK_STOP;
	return split;
}

#ifdef CONFIG_DEBUG_FS
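/*
 * Debugfs helper: walk every managed zone pfn by pfn and try to split each
 * large LRU folio found. This is expensive and meant only for testing.
 */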
static void split_huge_pages_all(void)
{
	struct zone *zone;
	struct page *page;
	struct folio *folio;
	unsigned long pfn, max_zone_pfn;
	unsigned long total = 0, split = 0;

	pr_debug("Split all THPs\n");
	for_each_zone(zone) {
		if (!managed_zone(zone))
			continue;
		max_zone_pfn = zone_end_pfn(zone);
		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
			int nr_pages;

			page = pfn_to_online_page(pfn);
			if (!page || PageTail(page))
				continue;
			folio = page_folio(page);
			if (!folio_try_get(folio))
				continue;

			if (unlikely(page_folio(page) != folio))
				goto next;

			if (zone != folio_zone(folio))
				goto next;

			if (!folio_test_large(folio)
				|| folio_test_hugetlb(folio)
				|| !folio_test_lru(folio))
				goto next;

			total++;
			folio_lock(folio);
			nr_pages = folio_nr_pages(folio);
			if (!split_folio(folio))
				split++;
			pfn += nr_pages - 1;
			folio_unlock(folio);
next:
			folio_put(folio);
			cond_resched();
		}
	}

	pr_debug("%lu of %lu THP split\n", split, total);
}

static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
{
	return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
	       is_vm_hugetlb_page(vma);
}

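/*
 * Debugfs helper: split every THP mapped into process @pid in the virtual
 * address range [vaddr_start, vaddr_end) down to @new_order (or to the
 * mapping's minimum folio order for file-backed folios, if that is larger).
 */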
static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
				unsigned long vaddr_end, unsigned int new_order)
{
	int ret = 0;
	struct task_struct *task;
	struct mm_struct *mm;
	unsigned long total = 0, split = 0;
	unsigned long addr;

	vaddr_start &= PAGE_MASK;
	vaddr_end &= PAGE_MASK;

	task = find_get_task_by_vpid(pid);
	if (!task) {
		ret = -ESRCH;
		goto out;
	}

	/* Find the mm_struct */
	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		ret = -EINVAL;
		goto out;
	}

	pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
		 pid, vaddr_start, vaddr_end);

	mmap_read_lock(mm);
	/*
	 * always increase addr by PAGE_SIZE, since we could have a PTE page
	 * table filled with PTE-mapped THPs, each of which is distinct.
	 */
	for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
		struct vm_area_struct *vma = vma_lookup(mm, addr);
		struct folio_walk fw;
		struct folio *folio;
		struct address_space *mapping;
		unsigned int target_order = new_order;

		if (!vma)
			break;

		/* skip special VMA and hugetlb VMA */
		if (vma_not_suitable_for_thp_split(vma)) {
			addr = vma->vm_end;
			continue;
		}

		folio = folio_walk_start(&fw, vma, addr, 0);
		if (!folio)
			continue;

		if (!is_transparent_hugepage(folio))
			goto next;

		if (!folio_test_anon(folio)) {
			mapping = folio->mapping;
			target_order = max(new_order,
					   mapping_min_folio_order(mapping));
		}

		if (target_order >= folio_order(folio))
			goto next;

		total++;
		/*
		 * For folios with private, split_huge_page_to_list_to_order()
		 * will try to drop it before split and then check if the folio
		 * can be split or not. So skip the check here.
		 */
		if (!folio_test_private(folio) &&
		    !can_split_folio(folio, 0, NULL))
			goto next;

		if (!folio_trylock(folio))
			goto next;
		folio_get(folio);
		folio_walk_end(&fw, vma);

		if (!folio_test_anon(folio) && folio->mapping != mapping)
			goto unlock;

		if (!split_folio_to_order(folio, target_order))
			split++;

unlock:
		folio_unlock(folio);
		folio_put(folio);

		cond_resched();
		continue;
next:
		folio_walk_end(&fw, vma);
		cond_resched();
	}
	mmap_read_unlock(mm);
	mmput(mm);

	pr_debug("%lu of %lu THP split\n", split, total);

out:
	return ret;
}

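/*
 * Debugfs helper: split the large folios of the file at @file_path in the
 * page-cache index range [off_start, off_end) down to @new_order, clamped
 * to the mapping's minimum folio order.
 */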
static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
				pgoff_t off_end, unsigned int new_order)
{
	struct filename *file;
	struct file *candidate;
	struct address_space *mapping;
	int ret = -EINVAL;
	pgoff_t index;
	int nr_pages = 1;
	unsigned long total = 0, split = 0;
	unsigned int min_order;
	unsigned int target_order;

	file = getname_kernel(file_path);
	if (IS_ERR(file))
		return ret;

	candidate = file_open_name(file, O_RDONLY, 0);
	if (IS_ERR(candidate))
		goto out;

	pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
		 file_path, off_start, off_end);

	mapping = candidate->f_mapping;
	min_order = mapping_min_folio_order(mapping);
	target_order = max(new_order, min_order);

	for (index = off_start; index < off_end; index += nr_pages) {
		struct folio *folio = filemap_get_folio(mapping, index);

		nr_pages = 1;
		if (IS_ERR(folio))
			continue;

		if (!folio_test_large(folio))
			goto next;

		total++;
		nr_pages = folio_nr_pages(folio);

		if (target_order >= folio_order(folio))
			goto next;

		if (!folio_trylock(folio))
			goto next;

		if (folio->mapping != mapping)
			goto unlock;

		if (!split_folio_to_order(folio, target_order))
			split++;

unlock:
		folio_unlock(folio);
next:
		folio_put(folio);
		cond_resched();
	}

	filp_close(candidate, NULL);
	ret = 0;

	pr_debug("%lu of %lu file-backed THP split\n", split, total);
out:
	putname(file);

	return ret;
}

#define MAX_INPUT_BUF_SZ 255

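/*
 * Write handler for the "split_huge_pages" debugfs file. Two input formats
 * are accepted (new_order is optional and defaults to 0); for example,
 * assuming debugfs is mounted at /sys/kernel/debug:
 *
 *   echo "<pid>,0x<vaddr_start>,0x<vaddr_end>[,<new_order>]" > /sys/kernel/debug/split_huge_pages
 *   echo "/path/to/file,0x<off_start>,0x<off_end>[,<new_order>]" > /sys/kernel/debug/split_huge_pages
 *
 * Writing just "1" splits all THPs system-wide via split_huge_pages_all().
 */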
static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
				size_t count, loff_t *ppops)
{
	static DEFINE_MUTEX(split_debug_mutex);
	ssize_t ret;
	/*
	 * hold pid, start_vaddr, end_vaddr, new_order or
	 * file_path, off_start, off_end, new_order
	 */
	char input_buf[MAX_INPUT_BUF_SZ];
	int pid;
	unsigned long vaddr_start, vaddr_end;
	unsigned int new_order = 0;

	ret = mutex_lock_interruptible(&split_debug_mutex);
	if (ret)
		return ret;

	ret = -EFAULT;

	memset(input_buf, 0, MAX_INPUT_BUF_SZ);
	if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
		goto out;

	input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';

	if (input_buf[0] == '/') {
		char *tok;
		char *buf = input_buf;
		char file_path[MAX_INPUT_BUF_SZ];
		pgoff_t off_start = 0, off_end = 0;
		size_t input_len = strlen(input_buf);

		tok = strsep(&buf, ",");
		if (tok) {
			strcpy(file_path, tok);
		} else {
			ret = -EINVAL;
			goto out;
		}

		ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order);
		if (ret != 2 && ret != 3) {
			ret = -EINVAL;
			goto out;
		}
		ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order);
		if (!ret)
			ret = input_len;

		goto out;
	}

	ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order);
	if (ret == 1 && pid == 1) {
		split_huge_pages_all();
		ret = strlen(input_buf);
		goto out;
	} else if (ret != 3 && ret != 4) {
		ret = -EINVAL;
		goto out;
	}

	ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order);
	if (!ret)
		ret = strlen(input_buf);
out:
	mutex_unlock(&split_debug_mutex);
	return ret;
}

static const struct file_operations split_huge_pages_fops = {
	.owner = THIS_MODULE,
	.write = split_huge_pages_write,
};

static int __init split_huge_pages_debugfs(void)
{
	debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
			    &split_huge_pages_fops);
	return 0;
}
late_initcall(split_huge_pages_debugfs);
#endif

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
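/*
 * Replace a present huge PMD with a PMD migration entry, preserving the
 * dirty, young, soft-dirty, uffd-wp and write/exclusive state of the
 * original mapping. Returns -EBUSY if the anon-exclusive flag cannot be
 * shared (e.g. because of a concurrent GUP pin).
 */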
int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
		struct page *page)
{
	struct folio *folio = page_folio(page);
	struct vm_area_struct *vma = pvmw->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address = pvmw->address;
	bool anon_exclusive;
	pmd_t pmdval;
	swp_entry_t entry;
	pmd_t pmdswp;

	if (!(pvmw->pmd && !pvmw->pte))
		return 0;

	flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
	pmdval = pmdp_invalidate(vma, address, pvmw->pmd);

	/* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
	anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
	if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) {
		set_pmd_at(mm, address, pvmw->pmd, pmdval);
		return -EBUSY;
	}

	if (pmd_dirty(pmdval))
		folio_mark_dirty(folio);
	if (pmd_write(pmdval))
		entry = make_writable_migration_entry(page_to_pfn(page));
	else if (anon_exclusive)
		entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
	else
		entry = make_readable_migration_entry(page_to_pfn(page));
	if (pmd_young(pmdval))
		entry = make_migration_entry_young(entry);
	if (pmd_dirty(pmdval))
		entry = make_migration_entry_dirty(entry);
	pmdswp = swp_entry_to_pmd(entry);
	if (pmd_soft_dirty(pmdval))
		pmdswp = pmd_swp_mksoft_dirty(pmdswp);
	if (pmd_uffd_wp(pmdval))
		pmdswp = pmd_swp_mkuffd_wp(pmdswp);
	set_pmd_at(mm, address, pvmw->pmd, pmdswp);
	folio_remove_rmap_pmd(folio, page, vma);
	folio_put(folio);
	trace_set_migration_pmd(address, pmd_val(pmdswp));

	return 0;
}

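/*
 * Counterpart of set_pmd_migration_entry(): convert a PMD migration entry
 * back into a present huge PMD pointing at @new, restoring the saved dirty,
 * young, soft-dirty, uffd-wp and write/exclusive state and re-adding the
 * rmap.
 */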
void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
{
	struct folio *folio = page_folio(new);
	struct vm_area_struct *vma = pvmw->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address = pvmw->address;
	unsigned long haddr = address & HPAGE_PMD_MASK;
	pmd_t pmde;
	swp_entry_t entry;

	if (!(pvmw->pmd && !pvmw->pte))
		return;

	entry = pmd_to_swp_entry(*pvmw->pmd);
	folio_get(folio);
	pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
	if (pmd_swp_soft_dirty(*pvmw->pmd))
		pmde = pmd_mksoft_dirty(pmde);
	if (is_writable_migration_entry(entry))
		pmde = pmd_mkwrite(pmde, vma);
	if (pmd_swp_uffd_wp(*pvmw->pmd))
		pmde = pmd_mkuffd_wp(pmde);
	if (!is_migration_entry_young(entry))
		pmde = pmd_mkold(pmde);
	/* NOTE: this may contain setting soft-dirty on some archs */
	if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
		pmde = pmd_mkdirty(pmde);

	if (folio_test_anon(folio)) {
		rmap_t rmap_flags = RMAP_NONE;

		if (!is_readable_migration_entry(entry))
			rmap_flags |= RMAP_EXCLUSIVE;

		folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags);
	} else {
		folio_add_file_rmap_pmd(folio, new, vma);
	}
	VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new));
	set_pmd_at(mm, haddr, pvmw->pmd, pmde);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache_pmd(vma, address, pvmw->pmd);
	trace_remove_migration_pmd(address, pmd_val(pmde));
}
#endif