khugepaged.c 72 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511
661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542
055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814
  1. // SPDX-License-Identifier: GPL-2.0
  2. #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  3. #include <linux/mm.h>
  4. #include <linux/sched.h>
  5. #include <linux/sched/mm.h>
  6. #include <linux/sched/coredump.h>
  7. #include <linux/mmu_notifier.h>
  8. #include <linux/rmap.h>
  9. #include <linux/swap.h>
  10. #include <linux/mm_inline.h>
  11. #include <linux/kthread.h>
  12. #include <linux/khugepaged.h>
  13. #include <linux/freezer.h>
  14. #include <linux/mman.h>
  15. #include <linux/hashtable.h>
  16. #include <linux/userfaultfd_k.h>
  17. #include <linux/page_idle.h>
  18. #include <linux/page_table_check.h>
  19. #include <linux/rcupdate_wait.h>
  20. #include <linux/swapops.h>
  21. #include <linux/shmem_fs.h>
  22. #include <linux/ksm.h>
  23. #include <asm/tlb.h>
  24. #include <asm/pgalloc.h>
  25. #include "internal.h"
  26. #include "mm_slot.h"
  27. enum scan_result {
  28. SCAN_FAIL,
  29. SCAN_SUCCEED,
  30. SCAN_PMD_NULL,
  31. SCAN_PMD_NONE,
  32. SCAN_PMD_MAPPED,
  33. SCAN_EXCEED_NONE_PTE,
  34. SCAN_EXCEED_SWAP_PTE,
  35. SCAN_EXCEED_SHARED_PTE,
  36. SCAN_PTE_NON_PRESENT,
  37. SCAN_PTE_UFFD_WP,
  38. SCAN_PTE_MAPPED_HUGEPAGE,
  39. SCAN_PAGE_RO,
  40. SCAN_LACK_REFERENCED_PAGE,
  41. SCAN_PAGE_NULL,
  42. SCAN_SCAN_ABORT,
  43. SCAN_PAGE_COUNT,
  44. SCAN_PAGE_LRU,
  45. SCAN_PAGE_LOCK,
  46. SCAN_PAGE_ANON,
  47. SCAN_PAGE_COMPOUND,
  48. SCAN_ANY_PROCESS,
  49. SCAN_VMA_NULL,
  50. SCAN_VMA_CHECK,
  51. SCAN_ADDRESS_RANGE,
  52. SCAN_DEL_PAGE_LRU,
  53. SCAN_ALLOC_HUGE_PAGE_FAIL,
  54. SCAN_CGROUP_CHARGE_FAIL,
  55. SCAN_TRUNCATED,
  56. SCAN_PAGE_HAS_PRIVATE,
  57. SCAN_STORE_FAILED,
  58. SCAN_COPY_MC,
  59. SCAN_PAGE_FILLED,
  60. };
  61. #define CREATE_TRACE_POINTS
  62. #include <trace/events/huge_memory.h>
  63. static struct task_struct *khugepaged_thread __read_mostly;
  64. static DEFINE_MUTEX(khugepaged_mutex);
  65. /* default scan 8*512 pte (or vmas) every 30 second */
  66. static unsigned int khugepaged_pages_to_scan __read_mostly;
  67. static unsigned int khugepaged_pages_collapsed;
  68. static unsigned int khugepaged_full_scans;
  69. static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
  70. /* during fragmentation poll the hugepage allocator once every minute */
  71. static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
  72. static unsigned long khugepaged_sleep_expire;
  73. static DEFINE_SPINLOCK(khugepaged_mm_lock);
  74. static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
  75. /*
  76. * default collapse hugepages if there is at least one pte mapped like
  77. * it would have happened if the vma was large enough during page
  78. * fault.
  79. *
  80. * Note that these are only respected if collapse was initiated by khugepaged.
  81. */
  82. unsigned int khugepaged_max_ptes_none __read_mostly;
  83. static unsigned int khugepaged_max_ptes_swap __read_mostly;
  84. static unsigned int khugepaged_max_ptes_shared __read_mostly;
  85. #define MM_SLOTS_HASH_BITS 10
  86. static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
  87. static struct kmem_cache *mm_slot_cache __ro_after_init;
  88. struct collapse_control {
  89. bool is_khugepaged;
  90. /* Num pages scanned per node */
  91. u32 node_load[MAX_NUMNODES];
  92. /* nodemask for allocation fallback */
  93. nodemask_t alloc_nmask;
  94. };
  95. /**
  96. * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
  97. * @slot: hash lookup from mm to mm_slot
  98. */
  99. struct khugepaged_mm_slot {
  100. struct mm_slot slot;
  101. };
  102. /**
  103. * struct khugepaged_scan - cursor for scanning
  104. * @mm_head: the head of the mm list to scan
  105. * @mm_slot: the current mm_slot we are scanning
  106. * @address: the next address inside that to be scanned
  107. *
  108. * There is only the one khugepaged_scan instance of this cursor structure.
  109. */
  110. struct khugepaged_scan {
  111. struct list_head mm_head;
  112. struct khugepaged_mm_slot *mm_slot;
  113. unsigned long address;
  114. };
  115. static struct khugepaged_scan khugepaged_scan = {
  116. .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
  117. };
  118. #ifdef CONFIG_SYSFS
  119. static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
  120. struct kobj_attribute *attr,
  121. char *buf)
  122. {
  123. return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs);
  124. }
  125. static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
  126. struct kobj_attribute *attr,
  127. const char *buf, size_t count)
  128. {
  129. unsigned int msecs;
  130. int err;
  131. err = kstrtouint(buf, 10, &msecs);
  132. if (err)
  133. return -EINVAL;
  134. khugepaged_scan_sleep_millisecs = msecs;
  135. khugepaged_sleep_expire = 0;
  136. wake_up_interruptible(&khugepaged_wait);
  137. return count;
  138. }
  139. static struct kobj_attribute scan_sleep_millisecs_attr =
  140. __ATTR_RW(scan_sleep_millisecs);
  141. static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
  142. struct kobj_attribute *attr,
  143. char *buf)
  144. {
  145. return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
  146. }
  147. static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
  148. struct kobj_attribute *attr,
  149. const char *buf, size_t count)
  150. {
  151. unsigned int msecs;
  152. int err;
  153. err = kstrtouint(buf, 10, &msecs);
  154. if (err)
  155. return -EINVAL;
  156. khugepaged_alloc_sleep_millisecs = msecs;
  157. khugepaged_sleep_expire = 0;
  158. wake_up_interruptible(&khugepaged_wait);
  159. return count;
  160. }
  161. static struct kobj_attribute alloc_sleep_millisecs_attr =
  162. __ATTR_RW(alloc_sleep_millisecs);
  163. static ssize_t pages_to_scan_show(struct kobject *kobj,
  164. struct kobj_attribute *attr,
  165. char *buf)
  166. {
  167. return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan);
  168. }
  169. static ssize_t pages_to_scan_store(struct kobject *kobj,
  170. struct kobj_attribute *attr,
  171. const char *buf, size_t count)
  172. {
  173. unsigned int pages;
  174. int err;
  175. err = kstrtouint(buf, 10, &pages);
  176. if (err || !pages)
  177. return -EINVAL;
  178. khugepaged_pages_to_scan = pages;
  179. return count;
  180. }
  181. static struct kobj_attribute pages_to_scan_attr =
  182. __ATTR_RW(pages_to_scan);
  183. static ssize_t pages_collapsed_show(struct kobject *kobj,
  184. struct kobj_attribute *attr,
  185. char *buf)
  186. {
  187. return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed);
  188. }
  189. static struct kobj_attribute pages_collapsed_attr =
  190. __ATTR_RO(pages_collapsed);
  191. static ssize_t full_scans_show(struct kobject *kobj,
  192. struct kobj_attribute *attr,
  193. char *buf)
  194. {
  195. return sysfs_emit(buf, "%u\n", khugepaged_full_scans);
  196. }
  197. static struct kobj_attribute full_scans_attr =
  198. __ATTR_RO(full_scans);
  199. static ssize_t defrag_show(struct kobject *kobj,
  200. struct kobj_attribute *attr, char *buf)
  201. {
  202. return single_hugepage_flag_show(kobj, attr, buf,
  203. TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
  204. }
  205. static ssize_t defrag_store(struct kobject *kobj,
  206. struct kobj_attribute *attr,
  207. const char *buf, size_t count)
  208. {
  209. return single_hugepage_flag_store(kobj, attr, buf, count,
  210. TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
  211. }
  212. static struct kobj_attribute khugepaged_defrag_attr =
  213. __ATTR_RW(defrag);
  214. /*
  215. * max_ptes_none controls if khugepaged should collapse hugepages over
  216. * any unmapped ptes in turn potentially increasing the memory
  217. * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
  218. * reduce the available free memory in the system as it
  219. * runs. Increasing max_ptes_none will instead potentially reduce the
  220. * free memory in the system during the khugepaged scan.
  221. */
  222. static ssize_t max_ptes_none_show(struct kobject *kobj,
  223. struct kobj_attribute *attr,
  224. char *buf)
  225. {
  226. return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none);
  227. }
  228. static ssize_t max_ptes_none_store(struct kobject *kobj,
  229. struct kobj_attribute *attr,
  230. const char *buf, size_t count)
  231. {
  232. int err;
  233. unsigned long max_ptes_none;
  234. err = kstrtoul(buf, 10, &max_ptes_none);
  235. if (err || max_ptes_none > HPAGE_PMD_NR - 1)
  236. return -EINVAL;
  237. khugepaged_max_ptes_none = max_ptes_none;
  238. return count;
  239. }
  240. static struct kobj_attribute khugepaged_max_ptes_none_attr =
  241. __ATTR_RW(max_ptes_none);
  242. static ssize_t max_ptes_swap_show(struct kobject *kobj,
  243. struct kobj_attribute *attr,
  244. char *buf)
  245. {
  246. return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap);
  247. }
  248. static ssize_t max_ptes_swap_store(struct kobject *kobj,
  249. struct kobj_attribute *attr,
  250. const char *buf, size_t count)
  251. {
  252. int err;
  253. unsigned long max_ptes_swap;
  254. err = kstrtoul(buf, 10, &max_ptes_swap);
  255. if (err || max_ptes_swap > HPAGE_PMD_NR - 1)
  256. return -EINVAL;
  257. khugepaged_max_ptes_swap = max_ptes_swap;
  258. return count;
  259. }
  260. static struct kobj_attribute khugepaged_max_ptes_swap_attr =
  261. __ATTR_RW(max_ptes_swap);
  262. static ssize_t max_ptes_shared_show(struct kobject *kobj,
  263. struct kobj_attribute *attr,
  264. char *buf)
  265. {
  266. return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared);
  267. }
  268. static ssize_t max_ptes_shared_store(struct kobject *kobj,
  269. struct kobj_attribute *attr,
  270. const char *buf, size_t count)
  271. {
  272. int err;
  273. unsigned long max_ptes_shared;
  274. err = kstrtoul(buf, 10, &max_ptes_shared);
  275. if (err || max_ptes_shared > HPAGE_PMD_NR - 1)
  276. return -EINVAL;
  277. khugepaged_max_ptes_shared = max_ptes_shared;
  278. return count;
  279. }
  280. static struct kobj_attribute khugepaged_max_ptes_shared_attr =
  281. __ATTR_RW(max_ptes_shared);
  282. static struct attribute *khugepaged_attr[] = {
  283. &khugepaged_defrag_attr.attr,
  284. &khugepaged_max_ptes_none_attr.attr,
  285. &khugepaged_max_ptes_swap_attr.attr,
  286. &khugepaged_max_ptes_shared_attr.attr,
  287. &pages_to_scan_attr.attr,
  288. &pages_collapsed_attr.attr,
  289. &full_scans_attr.attr,
  290. &scan_sleep_millisecs_attr.attr,
  291. &alloc_sleep_millisecs_attr.attr,
  292. NULL,
  293. };
  294. struct attribute_group khugepaged_attr_group = {
  295. .attrs = khugepaged_attr,
  296. .name = "khugepaged",
  297. };
  298. #endif /* CONFIG_SYSFS */
  299. int hugepage_madvise(struct vm_area_struct *vma,
  300. unsigned long *vm_flags, int advice)
  301. {
  302. switch (advice) {
  303. case MADV_HUGEPAGE:
  304. #ifdef CONFIG_S390
  305. /*
  306. * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
  307. * can't handle this properly after s390_enable_sie, so we simply
  308. * ignore the madvise to prevent qemu from causing a SIGSEGV.
  309. */
  310. if (mm_has_pgste(vma->vm_mm))
  311. return 0;
  312. #endif
  313. *vm_flags &= ~VM_NOHUGEPAGE;
  314. *vm_flags |= VM_HUGEPAGE;
  315. /*
  316. * If the vma become good for khugepaged to scan,
  317. * register it here without waiting a page fault that
  318. * may not happen any time soon.
  319. */
  320. khugepaged_enter_vma(vma, *vm_flags);
  321. break;
  322. case MADV_NOHUGEPAGE:
  323. *vm_flags &= ~VM_HUGEPAGE;
  324. *vm_flags |= VM_NOHUGEPAGE;
  325. /*
  326. * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
  327. * this vma even if we leave the mm registered in khugepaged if
  328. * it got registered before VM_NOHUGEPAGE was set.
  329. */
  330. break;
  331. }
  332. return 0;
  333. }
  334. int __init khugepaged_init(void)
  335. {
  336. mm_slot_cache = KMEM_CACHE(khugepaged_mm_slot, 0);
  337. if (!mm_slot_cache)
  338. return -ENOMEM;
  339. khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
  340. khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
  341. khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
  342. khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
  343. return 0;
  344. }
  345. void __init khugepaged_destroy(void)
  346. {
  347. kmem_cache_destroy(mm_slot_cache);
  348. }
  349. static inline int hpage_collapse_test_exit(struct mm_struct *mm)
  350. {
  351. return atomic_read(&mm->mm_users) == 0;
  352. }
  353. static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
  354. {
  355. return hpage_collapse_test_exit(mm) ||
  356. test_bit(MMF_DISABLE_THP, &mm->flags);
  357. }
  358. static bool hugepage_pmd_enabled(void)
  359. {
  360. /*
  361. * We cover both the anon and the file-backed case here; file-backed
  362. * hugepages, when configured in, are determined by the global control.
  363. * Anon pmd-sized hugepages are determined by the pmd-size control.
  364. */
  365. if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
  366. hugepage_global_enabled())
  367. return true;
  368. if (test_bit(PMD_ORDER, &huge_anon_orders_always))
  369. return true;
  370. if (test_bit(PMD_ORDER, &huge_anon_orders_madvise))
  371. return true;
  372. if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) &&
  373. hugepage_global_enabled())
  374. return true;
  375. return false;
  376. }
  377. void __khugepaged_enter(struct mm_struct *mm)
  378. {
  379. struct khugepaged_mm_slot *mm_slot;
  380. struct mm_slot *slot;
  381. int wakeup;
  382. /* __khugepaged_exit() must not run from under us */
  383. VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
  384. if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
  385. return;
  386. mm_slot = mm_slot_alloc(mm_slot_cache);
  387. if (!mm_slot)
  388. return;
  389. slot = &mm_slot->slot;
  390. spin_lock(&khugepaged_mm_lock);
  391. mm_slot_insert(mm_slots_hash, mm, slot);
  392. /*
  393. * Insert just behind the scanning cursor, to let the area settle
  394. * down a little.
  395. */
  396. wakeup = list_empty(&khugepaged_scan.mm_head);
  397. list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head);
  398. spin_unlock(&khugepaged_mm_lock);
  399. mmgrab(mm);
  400. if (wakeup)
  401. wake_up_interruptible(&khugepaged_wait);
  402. }
  403. void khugepaged_enter_vma(struct vm_area_struct *vma,
  404. unsigned long vm_flags)
  405. {
  406. if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
  407. hugepage_pmd_enabled()) {
  408. if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS,
  409. PMD_ORDER))
  410. __khugepaged_enter(vma->vm_mm);
  411. }
  412. }
  413. void __khugepaged_exit(struct mm_struct *mm)
  414. {
  415. struct khugepaged_mm_slot *mm_slot;
  416. struct mm_slot *slot;
  417. int free = 0;
  418. spin_lock(&khugepaged_mm_lock);
  419. slot = mm_slot_lookup(mm_slots_hash, mm);
  420. mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
  421. if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
  422. hash_del(&slot->hash);
  423. list_del(&slot->mm_node);
  424. free = 1;
  425. }
  426. spin_unlock(&khugepaged_mm_lock);
  427. if (free) {
  428. clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
  429. mm_slot_free(mm_slot_cache, mm_slot);
  430. mmdrop(mm);
  431. } else if (mm_slot) {
  432. /*
  433. * This is required to serialize against
  434. * hpage_collapse_test_exit() (which is guaranteed to run
  435. * under mmap sem read mode). Stop here (after we return all
  436. * pagetables will be destroyed) until khugepaged has finished
  437. * working on the pagetables under the mmap_lock.
  438. */
  439. mmap_write_lock(mm);
  440. mmap_write_unlock(mm);
  441. }
  442. }
  443. static void release_pte_folio(struct folio *folio)
  444. {
  445. node_stat_mod_folio(folio,
  446. NR_ISOLATED_ANON + folio_is_file_lru(folio),
  447. -folio_nr_pages(folio));
  448. folio_unlock(folio);
  449. folio_putback_lru(folio);
  450. }
  451. static void release_pte_pages(pte_t *pte, pte_t *_pte,
  452. struct list_head *compound_pagelist)
  453. {
  454. struct folio *folio, *tmp;
  455. while (--_pte >= pte) {
  456. pte_t pteval = ptep_get(_pte);
  457. unsigned long pfn;
  458. if (pte_none(pteval))
  459. continue;
  460. pfn = pte_pfn(pteval);
  461. if (is_zero_pfn(pfn))
  462. continue;
  463. folio = pfn_folio(pfn);
  464. if (folio_test_large(folio))
  465. continue;
  466. release_pte_folio(folio);
  467. }
  468. list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) {
  469. list_del(&folio->lru);
  470. release_pte_folio(folio);
  471. }
  472. }
  473. static bool is_refcount_suitable(struct folio *folio)
  474. {
  475. int expected_refcount = folio_mapcount(folio);
  476. if (!folio_test_anon(folio) || folio_test_swapcache(folio))
  477. expected_refcount += folio_nr_pages(folio);
  478. if (folio_test_private(folio))
  479. expected_refcount++;
  480. return folio_ref_count(folio) == expected_refcount;
  481. }
  482. static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
  483. unsigned long address,
  484. pte_t *pte,
  485. struct collapse_control *cc,
  486. struct list_head *compound_pagelist)
  487. {
  488. struct page *page = NULL;
  489. struct folio *folio = NULL;
  490. pte_t *_pte;
  491. int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
  492. bool writable = false;
  493. for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
  494. _pte++, address += PAGE_SIZE) {
  495. pte_t pteval = ptep_get(_pte);
  496. if (pte_none(pteval) || (pte_present(pteval) &&
  497. is_zero_pfn(pte_pfn(pteval)))) {
  498. ++none_or_zero;
  499. if (!userfaultfd_armed(vma) &&
  500. (!cc->is_khugepaged ||
  501. none_or_zero <= khugepaged_max_ptes_none)) {
  502. continue;
  503. } else {
  504. result = SCAN_EXCEED_NONE_PTE;
  505. count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
  506. goto out;
  507. }
  508. }
  509. if (!pte_present(pteval)) {
  510. result = SCAN_PTE_NON_PRESENT;
  511. goto out;
  512. }
  513. if (pte_uffd_wp(pteval)) {
  514. result = SCAN_PTE_UFFD_WP;
  515. goto out;
  516. }
  517. page = vm_normal_page(vma, address, pteval);
  518. if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
  519. result = SCAN_PAGE_NULL;
  520. goto out;
  521. }
  522. folio = page_folio(page);
  523. VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
  524. /* See hpage_collapse_scan_pmd(). */
  525. if (folio_likely_mapped_shared(folio)) {
  526. ++shared;
  527. if (cc->is_khugepaged &&
  528. shared > khugepaged_max_ptes_shared) {
  529. result = SCAN_EXCEED_SHARED_PTE;
  530. count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
  531. goto out;
  532. }
  533. }
  534. if (folio_test_large(folio)) {
  535. struct folio *f;
  536. /*
  537. * Check if we have dealt with the compound page
  538. * already
  539. */
  540. list_for_each_entry(f, compound_pagelist, lru) {
  541. if (folio == f)
  542. goto next;
  543. }
  544. }
  545. /*
  546. * We can do it before folio_isolate_lru because the
  547. * folio can't be freed from under us. NOTE: PG_lock
  548. * is needed to serialize against split_huge_page
  549. * when invoked from the VM.
  550. */
  551. if (!folio_trylock(folio)) {
  552. result = SCAN_PAGE_LOCK;
  553. goto out;
  554. }
  555. /*
  556. * Check if the page has any GUP (or other external) pins.
  557. *
  558. * The page table that maps the page has been already unlinked
  559. * from the page table tree and this process cannot get
  560. * an additional pin on the page.
  561. *
  562. * New pins can come later if the page is shared across fork,
  563. * but not from this process. The other process cannot write to
  564. * the page, only trigger CoW.
  565. */
  566. if (!is_refcount_suitable(folio)) {
  567. folio_unlock(folio);
  568. result = SCAN_PAGE_COUNT;
  569. goto out;
  570. }
  571. /*
  572. * Isolate the page to avoid collapsing an hugepage
  573. * currently in use by the VM.
  574. */
  575. if (!folio_isolate_lru(folio)) {
  576. folio_unlock(folio);
  577. result = SCAN_DEL_PAGE_LRU;
  578. goto out;
  579. }
  580. node_stat_mod_folio(folio,
  581. NR_ISOLATED_ANON + folio_is_file_lru(folio),
  582. folio_nr_pages(folio));
  583. VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
  584. VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
  585. if (folio_test_large(folio))
  586. list_add_tail(&folio->lru, compound_pagelist);
  587. next:
  588. /*
  589. * If collapse was initiated by khugepaged, check that there is
  590. * enough young pte to justify collapsing the page
  591. */
  592. if (cc->is_khugepaged &&
  593. (pte_young(pteval) || folio_test_young(folio) ||
  594. folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
  595. address)))
  596. referenced++;
  597. if (pte_write(pteval))
  598. writable = true;
  599. }
  600. if (unlikely(!writable)) {
  601. result = SCAN_PAGE_RO;
  602. } else if (unlikely(cc->is_khugepaged && !referenced)) {
  603. result = SCAN_LACK_REFERENCED_PAGE;
  604. } else {
  605. result = SCAN_SUCCEED;
  606. trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
  607. referenced, writable, result);
  608. return result;
  609. }
  610. out:
  611. release_pte_pages(pte, _pte, compound_pagelist);
  612. trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
  613. referenced, writable, result);
  614. return result;
  615. }
  616. static void __collapse_huge_page_copy_succeeded(pte_t *pte,
  617. struct vm_area_struct *vma,
  618. unsigned long address,
  619. spinlock_t *ptl,
  620. struct list_head *compound_pagelist)
  621. {
  622. struct folio *src, *tmp;
  623. pte_t *_pte;
  624. pte_t pteval;
  625. for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
  626. _pte++, address += PAGE_SIZE) {
  627. pteval = ptep_get(_pte);
  628. if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
  629. add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
  630. if (is_zero_pfn(pte_pfn(pteval))) {
  631. /*
  632. * ptl mostly unnecessary.
  633. */
  634. spin_lock(ptl);
  635. ptep_clear(vma->vm_mm, address, _pte);
  636. spin_unlock(ptl);
  637. ksm_might_unmap_zero_page(vma->vm_mm, pteval);
  638. }
  639. } else {
  640. struct page *src_page = pte_page(pteval);
  641. src = page_folio(src_page);
  642. if (!folio_test_large(src))
  643. release_pte_folio(src);
  644. /*
  645. * ptl mostly unnecessary, but preempt has to
  646. * be disabled to update the per-cpu stats
  647. * inside folio_remove_rmap_pte().
  648. */
  649. spin_lock(ptl);
  650. ptep_clear(vma->vm_mm, address, _pte);
  651. folio_remove_rmap_pte(src, src_page, vma);
  652. spin_unlock(ptl);
  653. free_page_and_swap_cache(src_page);
  654. }
  655. }
  656. list_for_each_entry_safe(src, tmp, compound_pagelist, lru) {
  657. list_del(&src->lru);
  658. node_stat_sub_folio(src, NR_ISOLATED_ANON +
  659. folio_is_file_lru(src));
  660. folio_unlock(src);
  661. free_swap_cache(src);
  662. folio_putback_lru(src);
  663. }
  664. }
  665. static void __collapse_huge_page_copy_failed(pte_t *pte,
  666. pmd_t *pmd,
  667. pmd_t orig_pmd,
  668. struct vm_area_struct *vma,
  669. struct list_head *compound_pagelist)
  670. {
  671. spinlock_t *pmd_ptl;
  672. /*
  673. * Re-establish the PMD to point to the original page table
  674. * entry. Restoring PMD needs to be done prior to releasing
  675. * pages. Since pages are still isolated and locked here,
  676. * acquiring anon_vma_lock_write is unnecessary.
  677. */
  678. pmd_ptl = pmd_lock(vma->vm_mm, pmd);
  679. pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd));
  680. spin_unlock(pmd_ptl);
  681. /*
  682. * Release both raw and compound pages isolated
  683. * in __collapse_huge_page_isolate.
  684. */
  685. release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
  686. }
  687. /*
  688. * __collapse_huge_page_copy - attempts to copy memory contents from raw
  689. * pages to a hugepage. Cleans up the raw pages if copying succeeds;
  690. * otherwise restores the original page table and releases isolated raw pages.
  691. * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC.
  692. *
  693. * @pte: starting of the PTEs to copy from
  694. * @folio: the new hugepage to copy contents to
  695. * @pmd: pointer to the new hugepage's PMD
  696. * @orig_pmd: the original raw pages' PMD
  697. * @vma: the original raw pages' virtual memory area
  698. * @address: starting address to copy
  699. * @ptl: lock on raw pages' PTEs
  700. * @compound_pagelist: list that stores compound pages
  701. */
  702. static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
  703. pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
  704. unsigned long address, spinlock_t *ptl,
  705. struct list_head *compound_pagelist)
  706. {
  707. unsigned int i;
  708. int result = SCAN_SUCCEED;
  709. /*
  710. * Copying pages' contents is subject to memory poison at any iteration.
  711. */
  712. for (i = 0; i < HPAGE_PMD_NR; i++) {
  713. pte_t pteval = ptep_get(pte + i);
  714. struct page *page = folio_page(folio, i);
  715. unsigned long src_addr = address + i * PAGE_SIZE;
  716. struct page *src_page;
  717. if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
  718. clear_user_highpage(page, src_addr);
  719. continue;
  720. }
  721. src_page = pte_page(pteval);
  722. if (copy_mc_user_highpage(page, src_page, src_addr, vma) > 0) {
  723. result = SCAN_COPY_MC;
  724. break;
  725. }
  726. }
  727. if (likely(result == SCAN_SUCCEED))
  728. __collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
  729. compound_pagelist);
  730. else
  731. __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
  732. compound_pagelist);
  733. return result;
  734. }
  735. static void khugepaged_alloc_sleep(void)
  736. {
  737. DEFINE_WAIT(wait);
  738. add_wait_queue(&khugepaged_wait, &wait);
  739. __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
  740. schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
  741. remove_wait_queue(&khugepaged_wait, &wait);
  742. }
  743. struct collapse_control khugepaged_collapse_control = {
  744. .is_khugepaged = true,
  745. };
  746. static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
  747. {
  748. int i;
  749. /*
  750. * If node_reclaim_mode is disabled, then no extra effort is made to
  751. * allocate memory locally.
  752. */
  753. if (!node_reclaim_enabled())
  754. return false;
  755. /* If there is a count for this node already, it must be acceptable */
  756. if (cc->node_load[nid])
  757. return false;
  758. for (i = 0; i < MAX_NUMNODES; i++) {
  759. if (!cc->node_load[i])
  760. continue;
  761. if (node_distance(nid, i) > node_reclaim_distance)
  762. return true;
  763. }
  764. return false;
  765. }
  766. #define khugepaged_defrag() \
  767. (transparent_hugepage_flags & \
  768. (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
  769. /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
  770. static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
  771. {
  772. return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
  773. }
  774. #ifdef CONFIG_NUMA
  775. static int hpage_collapse_find_target_node(struct collapse_control *cc)
  776. {
  777. int nid, target_node = 0, max_value = 0;
  778. /* find first node with max normal pages hit */
  779. for (nid = 0; nid < MAX_NUMNODES; nid++)
  780. if (cc->node_load[nid] > max_value) {
  781. max_value = cc->node_load[nid];
  782. target_node = nid;
  783. }
  784. for_each_online_node(nid) {
  785. if (max_value == cc->node_load[nid])
  786. node_set(nid, cc->alloc_nmask);
  787. }
  788. return target_node;
  789. }
  790. #else
  791. static int hpage_collapse_find_target_node(struct collapse_control *cc)
  792. {
  793. return 0;
  794. }
  795. #endif
  796. /*
  797. * If mmap_lock temporarily dropped, revalidate vma
  798. * before taking mmap_lock.
  799. * Returns enum scan_result value.
  800. */
  801. static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
  802. bool expect_anon,
  803. struct vm_area_struct **vmap,
  804. struct collapse_control *cc)
  805. {
  806. struct vm_area_struct *vma;
  807. unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0;
  808. if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
  809. return SCAN_ANY_PROCESS;
  810. *vmap = vma = find_vma(mm, address);
  811. if (!vma)
  812. return SCAN_VMA_NULL;
  813. if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
  814. return SCAN_ADDRESS_RANGE;
  815. if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER))
  816. return SCAN_VMA_CHECK;
  817. /*
  818. * Anon VMA expected, the address may be unmapped then
  819. * remapped to file after khugepaged reaquired the mmap_lock.
  820. *
  821. * thp_vma_allowable_order may return true for qualified file
  822. * vmas.
  823. */
  824. if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
  825. return SCAN_PAGE_ANON;
  826. return SCAN_SUCCEED;
  827. }
  828. static int find_pmd_or_thp_or_none(struct mm_struct *mm,
  829. unsigned long address,
  830. pmd_t **pmd)
  831. {
  832. pmd_t pmde;
  833. *pmd = mm_find_pmd(mm, address);
  834. if (!*pmd)
  835. return SCAN_PMD_NULL;
  836. pmde = pmdp_get_lockless(*pmd);
  837. if (pmd_none(pmde))
  838. return SCAN_PMD_NONE;
  839. if (!pmd_present(pmde))
  840. return SCAN_PMD_NULL;
  841. if (pmd_trans_huge(pmde))
  842. return SCAN_PMD_MAPPED;
  843. if (pmd_devmap(pmde))
  844. return SCAN_PMD_NULL;
  845. if (pmd_bad(pmde))
  846. return SCAN_PMD_NULL;
  847. return SCAN_SUCCEED;
  848. }
  849. static int check_pmd_still_valid(struct mm_struct *mm,
  850. unsigned long address,
  851. pmd_t *pmd)
  852. {
  853. pmd_t *new_pmd;
  854. int result = find_pmd_or_thp_or_none(mm, address, &new_pmd);
  855. if (result != SCAN_SUCCEED)
  856. return result;
  857. if (new_pmd != pmd)
  858. return SCAN_FAIL;
  859. return SCAN_SUCCEED;
  860. }
  861. /*
  862. * Bring missing pages in from swap, to complete THP collapse.
  863. * Only done if hpage_collapse_scan_pmd believes it is worthwhile.
  864. *
  865. * Called and returns without pte mapped or spinlocks held.
  866. * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
  867. */
  868. static int __collapse_huge_page_swapin(struct mm_struct *mm,
  869. struct vm_area_struct *vma,
  870. unsigned long haddr, pmd_t *pmd,
  871. int referenced)
  872. {
  873. int swapped_in = 0;
  874. vm_fault_t ret = 0;
  875. unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
  876. int result;
  877. pte_t *pte = NULL;
  878. spinlock_t *ptl;
  879. for (address = haddr; address < end; address += PAGE_SIZE) {
  880. struct vm_fault vmf = {
  881. .vma = vma,
  882. .address = address,
  883. .pgoff = linear_page_index(vma, address),
  884. .flags = FAULT_FLAG_ALLOW_RETRY,
  885. .pmd = pmd,
  886. };
  887. if (!pte++) {
  888. pte = pte_offset_map_nolock(mm, pmd, address, &ptl);
  889. if (!pte) {
  890. mmap_read_unlock(mm);
  891. result = SCAN_PMD_NULL;
  892. goto out;
  893. }
  894. }
  895. vmf.orig_pte = ptep_get_lockless(pte);
  896. if (!is_swap_pte(vmf.orig_pte))
  897. continue;
  898. vmf.pte = pte;
  899. vmf.ptl = ptl;
  900. ret = do_swap_page(&vmf);
  901. /* Which unmaps pte (after perhaps re-checking the entry) */
  902. pte = NULL;
  903. /*
  904. * do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
  905. * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because
  906. * we do not retry here and swap entry will remain in pagetable
  907. * resulting in later failure.
  908. */
  909. if (ret & VM_FAULT_RETRY) {
  910. /* Likely, but not guaranteed, that page lock failed */
  911. result = SCAN_PAGE_LOCK;
  912. goto out;
  913. }
  914. if (ret & VM_FAULT_ERROR) {
  915. mmap_read_unlock(mm);
  916. result = SCAN_FAIL;
  917. goto out;
  918. }
  919. swapped_in++;
  920. }
  921. if (pte)
  922. pte_unmap(pte);
  923. /* Drain LRU cache to remove extra pin on the swapped in pages */
  924. if (swapped_in)
  925. lru_add_drain();
  926. result = SCAN_SUCCEED;
  927. out:
  928. trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result);
  929. return result;
  930. }
  931. static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
  932. struct collapse_control *cc)
  933. {
  934. gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
  935. GFP_TRANSHUGE);
  936. int node = hpage_collapse_find_target_node(cc);
  937. struct folio *folio;
  938. folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
  939. if (!folio) {
  940. *foliop = NULL;
  941. count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
  942. return SCAN_ALLOC_HUGE_PAGE_FAIL;
  943. }
  944. count_vm_event(THP_COLLAPSE_ALLOC);
  945. if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
  946. folio_put(folio);
  947. *foliop = NULL;
  948. return SCAN_CGROUP_CHARGE_FAIL;
  949. }
  950. count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
  951. *foliop = folio;
  952. return SCAN_SUCCEED;
  953. }
  954. static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
  955. int referenced, int unmapped,
  956. struct collapse_control *cc)
  957. {
  958. LIST_HEAD(compound_pagelist);
  959. pmd_t *pmd, _pmd;
  960. pte_t *pte;
  961. pgtable_t pgtable;
  962. struct folio *folio;
  963. spinlock_t *pmd_ptl, *pte_ptl;
  964. int result = SCAN_FAIL;
  965. struct vm_area_struct *vma;
  966. struct mmu_notifier_range range;
  967. VM_BUG_ON(address & ~HPAGE_PMD_MASK);
  968. /*
  969. * Before allocating the hugepage, release the mmap_lock read lock.
  970. * The allocation can take potentially a long time if it involves
  971. * sync compaction, and we do not need to hold the mmap_lock during
  972. * that. We will recheck the vma after taking it again in write mode.
  973. */
  974. mmap_read_unlock(mm);
  975. result = alloc_charge_folio(&folio, mm, cc);
  976. if (result != SCAN_SUCCEED)
  977. goto out_nolock;
  978. mmap_read_lock(mm);
  979. result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
  980. if (result != SCAN_SUCCEED) {
  981. mmap_read_unlock(mm);
  982. goto out_nolock;
  983. }
  984. result = find_pmd_or_thp_or_none(mm, address, &pmd);
  985. if (result != SCAN_SUCCEED) {
  986. mmap_read_unlock(mm);
  987. goto out_nolock;
  988. }
  989. if (unmapped) {
  990. /*
  991. * __collapse_huge_page_swapin will return with mmap_lock
  992. * released when it fails. So we jump out_nolock directly in
  993. * that case. Continuing to collapse causes inconsistency.
  994. */
  995. result = __collapse_huge_page_swapin(mm, vma, address, pmd,
  996. referenced);
  997. if (result != SCAN_SUCCEED)
  998. goto out_nolock;
  999. }
  1000. mmap_read_unlock(mm);
  1001. /*
  1002. * Prevent all access to pagetables with the exception of
  1003. * gup_fast later handled by the ptep_clear_flush and the VM
  1004. * handled by the anon_vma lock + PG_lock.
  1005. *
  1006. * UFFDIO_MOVE is prevented to race as well thanks to the
  1007. * mmap_lock.
  1008. */
  1009. mmap_write_lock(mm);
  1010. result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
  1011. if (result != SCAN_SUCCEED)
  1012. goto out_up_write;
  1013. /* check if the pmd is still valid */
  1014. result = check_pmd_still_valid(mm, address, pmd);
  1015. if (result != SCAN_SUCCEED)
  1016. goto out_up_write;
  1017. vma_start_write(vma);
  1018. anon_vma_lock_write(vma->anon_vma);
  1019. mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
  1020. address + HPAGE_PMD_SIZE);
  1021. mmu_notifier_invalidate_range_start(&range);
  1022. pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
  1023. /*
  1024. * This removes any huge TLB entry from the CPU so we won't allow
  1025. * huge and small TLB entries for the same virtual address to
  1026. * avoid the risk of CPU bugs in that area.
  1027. *
  1028. * Parallel GUP-fast is fine since GUP-fast will back off when
  1029. * it detects PMD is changed.
  1030. */
  1031. _pmd = pmdp_collapse_flush(vma, address, pmd);
  1032. spin_unlock(pmd_ptl);
  1033. mmu_notifier_invalidate_range_end(&range);
  1034. tlb_remove_table_sync_one();
  1035. pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
  1036. if (pte) {
  1037. result = __collapse_huge_page_isolate(vma, address, pte, cc,
  1038. &compound_pagelist);
  1039. spin_unlock(pte_ptl);
  1040. } else {
  1041. result = SCAN_PMD_NULL;
  1042. }
  1043. if (unlikely(result != SCAN_SUCCEED)) {
  1044. if (pte)
  1045. pte_unmap(pte);
  1046. spin_lock(pmd_ptl);
  1047. BUG_ON(!pmd_none(*pmd));
  1048. /*
  1049. * We can only use set_pmd_at when establishing
  1050. * hugepmds and never for establishing regular pmds that
  1051. * points to regular pagetables. Use pmd_populate for that
  1052. */
  1053. pmd_populate(mm, pmd, pmd_pgtable(_pmd));
  1054. spin_unlock(pmd_ptl);
  1055. anon_vma_unlock_write(vma->anon_vma);
  1056. goto out_up_write;
  1057. }
  1058. /*
  1059. * All pages are isolated and locked so anon_vma rmap
  1060. * can't run anymore.
  1061. */
  1062. anon_vma_unlock_write(vma->anon_vma);
  1063. result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
  1064. vma, address, pte_ptl,
  1065. &compound_pagelist);
  1066. pte_unmap(pte);
  1067. if (unlikely(result != SCAN_SUCCEED))
  1068. goto out_up_write;
  1069. /*
  1070. * The smp_wmb() inside __folio_mark_uptodate() ensures the
  1071. * copy_huge_page writes become visible before the set_pmd_at()
  1072. * write.
  1073. */
  1074. __folio_mark_uptodate(folio);
  1075. pgtable = pmd_pgtable(_pmd);
  1076. _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot);
  1077. _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
  1078. spin_lock(pmd_ptl);
  1079. BUG_ON(!pmd_none(*pmd));
  1080. folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
  1081. folio_add_lru_vma(folio, vma);
  1082. pgtable_trans_huge_deposit(mm, pmd, pgtable);
  1083. set_pmd_at(mm, address, pmd, _pmd);
  1084. update_mmu_cache_pmd(vma, address, pmd);
  1085. deferred_split_folio(folio, false);
  1086. spin_unlock(pmd_ptl);
  1087. folio = NULL;
  1088. result = SCAN_SUCCEED;
  1089. out_up_write:
  1090. mmap_write_unlock(mm);
  1091. out_nolock:
  1092. if (folio)
  1093. folio_put(folio);
  1094. trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
  1095. return result;
  1096. }
  1097. static int hpage_collapse_scan_pmd(struct mm_struct *mm,
  1098. struct vm_area_struct *vma,
  1099. unsigned long address, bool *mmap_locked,
  1100. struct collapse_control *cc)
  1101. {
  1102. pmd_t *pmd;
  1103. pte_t *pte, *_pte;
  1104. int result = SCAN_FAIL, referenced = 0;
  1105. int none_or_zero = 0, shared = 0;
  1106. struct page *page = NULL;
  1107. struct folio *folio = NULL;
  1108. unsigned long _address;
  1109. spinlock_t *ptl;
  1110. int node = NUMA_NO_NODE, unmapped = 0;
  1111. bool writable = false;
  1112. VM_BUG_ON(address & ~HPAGE_PMD_MASK);
  1113. result = find_pmd_or_thp_or_none(mm, address, &pmd);
  1114. if (result != SCAN_SUCCEED)
  1115. goto out;
  1116. memset(cc->node_load, 0, sizeof(cc->node_load));
  1117. nodes_clear(cc->alloc_nmask);
  1118. pte = pte_offset_map_lock(mm, pmd, address, &ptl);
  1119. if (!pte) {
  1120. result = SCAN_PMD_NULL;
  1121. goto out;
  1122. }
  1123. for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
  1124. _pte++, _address += PAGE_SIZE) {
  1125. pte_t pteval = ptep_get(_pte);
  1126. if (is_swap_pte(pteval)) {
  1127. ++unmapped;
  1128. if (!cc->is_khugepaged ||
  1129. unmapped <= khugepaged_max_ptes_swap) {
  1130. /*
  1131. * Always be strict with uffd-wp
  1132. * enabled swap entries. Please see
  1133. * comment below for pte_uffd_wp().
  1134. */
  1135. if (pte_swp_uffd_wp_any(pteval)) {
  1136. result = SCAN_PTE_UFFD_WP;
  1137. goto out_unmap;
  1138. }
  1139. continue;
  1140. } else {
  1141. result = SCAN_EXCEED_SWAP_PTE;
  1142. count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
  1143. goto out_unmap;
  1144. }
  1145. }
  1146. if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
  1147. ++none_or_zero;
  1148. if (!userfaultfd_armed(vma) &&
  1149. (!cc->is_khugepaged ||
  1150. none_or_zero <= khugepaged_max_ptes_none)) {
  1151. continue;
  1152. } else {
  1153. result = SCAN_EXCEED_NONE_PTE;
  1154. count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
  1155. goto out_unmap;
  1156. }
  1157. }
  1158. if (pte_uffd_wp(pteval)) {
  1159. /*
  1160. * Don't collapse the page if any of the small
  1161. * PTEs are armed with uffd write protection.
  1162. * Here we can also mark the new huge pmd as
  1163. * write protected if any of the small ones is
  1164. * marked but that could bring unknown
  1165. * userfault messages that falls outside of
  1166. * the registered range. So, just be simple.
  1167. */
  1168. result = SCAN_PTE_UFFD_WP;
  1169. goto out_unmap;
  1170. }
  1171. if (pte_write(pteval))
  1172. writable = true;
  1173. page = vm_normal_page(vma, _address, pteval);
  1174. if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
  1175. result = SCAN_PAGE_NULL;
  1176. goto out_unmap;
  1177. }
  1178. folio = page_folio(page);
  1179. if (!folio_test_anon(folio)) {
  1180. result = SCAN_PAGE_ANON;
  1181. goto out_unmap;
  1182. }
  1183. /*
  1184. * We treat a single page as shared if any part of the THP
  1185. * is shared. "False negatives" from
  1186. * folio_likely_mapped_shared() are not expected to matter
  1187. * much in practice.
  1188. */
  1189. if (folio_likely_mapped_shared(folio)) {
  1190. ++shared;
  1191. if (cc->is_khugepaged &&
  1192. shared > khugepaged_max_ptes_shared) {
  1193. result = SCAN_EXCEED_SHARED_PTE;
  1194. count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
  1195. goto out_unmap;
  1196. }
  1197. }
  1198. /*
  1199. * Record which node the original page is from and save this
  1200. * information to cc->node_load[].
  1201. * Khugepaged will allocate hugepage from the node has the max
  1202. * hit record.
  1203. */
  1204. node = folio_nid(folio);
  1205. if (hpage_collapse_scan_abort(node, cc)) {
  1206. result = SCAN_SCAN_ABORT;
  1207. goto out_unmap;
  1208. }
  1209. cc->node_load[node]++;
  1210. if (!folio_test_lru(folio)) {
  1211. result = SCAN_PAGE_LRU;
  1212. goto out_unmap;
  1213. }
  1214. if (folio_test_locked(folio)) {
  1215. result = SCAN_PAGE_LOCK;
  1216. goto out_unmap;
  1217. }
  1218. /*
  1219. * Check if the page has any GUP (or other external) pins.
  1220. *
  1221. * Here the check may be racy:
  1222. * it may see folio_mapcount() > folio_ref_count().
  1223. * But such case is ephemeral we could always retry collapse
  1224. * later. However it may report false positive if the page
  1225. * has excessive GUP pins (i.e. 512). Anyway the same check
  1226. * will be done again later the risk seems low.
  1227. */
  1228. if (!is_refcount_suitable(folio)) {
  1229. result = SCAN_PAGE_COUNT;
  1230. goto out_unmap;
  1231. }
  1232. /*
  1233. * If collapse was initiated by khugepaged, check that there is
  1234. * enough young pte to justify collapsing the page
  1235. */
  1236. if (cc->is_khugepaged &&
  1237. (pte_young(pteval) || folio_test_young(folio) ||
  1238. folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
  1239. address)))
  1240. referenced++;
  1241. }
  1242. if (!writable) {
  1243. result = SCAN_PAGE_RO;
  1244. } else if (cc->is_khugepaged &&
  1245. (!referenced ||
  1246. (unmapped && referenced < HPAGE_PMD_NR / 2))) {
  1247. result = SCAN_LACK_REFERENCED_PAGE;
  1248. } else {
  1249. result = SCAN_SUCCEED;
  1250. }
  1251. out_unmap:
  1252. pte_unmap_unlock(pte, ptl);
  1253. if (result == SCAN_SUCCEED) {
  1254. result = collapse_huge_page(mm, address, referenced,
  1255. unmapped, cc);
  1256. /* collapse_huge_page will return with the mmap_lock released */
  1257. *mmap_locked = false;
  1258. }
  1259. out:
  1260. trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
  1261. none_or_zero, result, unmapped);
  1262. return result;
  1263. }
  1264. static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
  1265. {
  1266. struct mm_slot *slot = &mm_slot->slot;
  1267. struct mm_struct *mm = slot->mm;
  1268. lockdep_assert_held(&khugepaged_mm_lock);
  1269. if (hpage_collapse_test_exit(mm)) {
  1270. /* free mm_slot */
  1271. hash_del(&slot->hash);
  1272. list_del(&slot->mm_node);
  1273. /*
  1274. * Not strictly needed because the mm exited already.
  1275. *
  1276. * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
  1277. */
  1278. /* khugepaged_mm_lock actually not necessary for the below */
  1279. mm_slot_free(mm_slot_cache, mm_slot);
  1280. mmdrop(mm);
  1281. }
  1282. }
  1283. #ifdef CONFIG_SHMEM
  1284. /* hpage must be locked, and mmap_lock must be held */
  1285. static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
  1286. pmd_t *pmdp, struct page *hpage)
  1287. {
  1288. struct vm_fault vmf = {
  1289. .vma = vma,
  1290. .address = addr,
  1291. .flags = 0,
  1292. .pmd = pmdp,
  1293. };
  1294. VM_BUG_ON(!PageTransHuge(hpage));
  1295. mmap_assert_locked(vma->vm_mm);
  1296. if (do_set_pmd(&vmf, hpage))
  1297. return SCAN_FAIL;
  1298. get_page(hpage);
  1299. return SCAN_SUCCEED;
  1300. }

/**
 * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
 * address haddr.
 *
 * @mm: process address space where collapse happens
 * @addr: THP collapse address
 * @install_pmd: If a huge PMD should be installed
 *
 * This function checks whether all the PTEs in the PMD are pointing to the
 * right THP. If so, retract the page table so the THP can be refaulted in
 * as pmd-mapped. Possibly install a huge PMD mapping the THP.
 */
int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
			    bool install_pmd)
{
	struct mmu_notifier_range range;
	bool notified = false;
	unsigned long haddr = addr & HPAGE_PMD_MASK;
	struct vm_area_struct *vma = vma_lookup(mm, haddr);
	struct folio *folio;
	pte_t *start_pte, *pte;
	pmd_t *pmd, pgt_pmd;
	spinlock_t *pml = NULL, *ptl;
	int nr_ptes = 0, result = SCAN_FAIL;
	int i;

	mmap_assert_locked(mm);

	/* First check VMA found, in case page tables are being torn down */
	if (!vma || !vma->vm_file ||
	    !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
		return SCAN_VMA_CHECK;

	/* Fast check before locking page if already PMD-mapped */
	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
	if (result == SCAN_PMD_MAPPED)
		return result;

	/*
	 * If we are here, we've succeeded in replacing all the native pages
	 * in the page cache with a single hugepage. If a mm were to fault-in
	 * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
	 * and map it by a PMD, regardless of sysfs THP settings. As such, let's
	 * analogously elide sysfs THP settings here.
	 */
	if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
		return SCAN_VMA_CHECK;

	/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
	if (userfaultfd_wp(vma))
		return SCAN_PTE_UFFD_WP;

	folio = filemap_lock_folio(vma->vm_file->f_mapping,
				   linear_page_index(vma, haddr));
	if (IS_ERR(folio))
		return SCAN_PAGE_NULL;

	if (folio_order(folio) != HPAGE_PMD_ORDER) {
		result = SCAN_PAGE_COMPOUND;
		goto drop_folio;
	}

	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
	switch (result) {
	case SCAN_SUCCEED:
		break;
	case SCAN_PMD_NONE:
		/*
		 * All pte entries have been removed and pmd cleared.
		 * Skip all the pte checks and just update the pmd mapping.
		 */
		goto maybe_install_pmd;
	default:
		goto drop_folio;
	}

	result = SCAN_FAIL;
	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
	if (!start_pte)		/* mmap_lock + page lock should prevent this */
		goto drop_folio;

	/* step 1: check all mapped PTEs are to the right huge page */
	for (i = 0, addr = haddr, pte = start_pte;
	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
		struct page *page;
		pte_t ptent = ptep_get(pte);

		/* empty pte, skip */
		if (pte_none(ptent))
			continue;

		/* page swapped out, abort */
		if (!pte_present(ptent)) {
			result = SCAN_PTE_NON_PRESENT;
			goto abort;
		}

		page = vm_normal_page(vma, addr, ptent);
		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
			page = NULL;
		/*
		 * Note that uprobe, debugger, or MAP_PRIVATE may change the
		 * page table, but the new page will not be a subpage of hpage.
		 */
		if (folio_page(folio, i) != page)
			goto abort;
	}

	pte_unmap_unlock(start_pte, ptl);
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				haddr, haddr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	notified = true;

	/*
	 * pmd_lock covers a wider range than ptl, and (if split from mm's
	 * page_table_lock) ptl nests inside pml. The less time we hold pml,
	 * the better; but userfaultfd's mfill_atomic_pte() on a private VMA
	 * inserts a valid as-if-COWed PTE without even looking up page cache.
	 * So page lock of folio does not protect from it, so we must not drop
	 * ptl before pgt_pmd is removed, so uffd private needs pml taken now.
	 */
	if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
		pml = pmd_lock(mm, pmd);

	start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
	if (!start_pte)		/* mmap_lock + page lock should prevent this */
		goto abort;
	if (!pml)
		spin_lock(ptl);
	else if (ptl != pml)
		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

	/* step 2: clear page table and adjust rmap */
	for (i = 0, addr = haddr, pte = start_pte;
	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
		struct page *page;
		pte_t ptent = ptep_get(pte);

		if (pte_none(ptent))
			continue;
		/*
		 * We dropped ptl after the first scan, to do the mmu_notifier:
		 * page lock stops more PTEs of the folio being faulted in, but
		 * does not stop write faults COWing anon copies from existing
		 * PTEs; and does not stop those being swapped out or migrated.
		 */
		if (!pte_present(ptent)) {
			result = SCAN_PTE_NON_PRESENT;
			goto abort;
		}
		page = vm_normal_page(vma, addr, ptent);
		if (folio_page(folio, i) != page)
			goto abort;

		/*
		 * Must clear entry, or a racing truncate may re-remove it.
		 * TLB flush can be left until pmdp_collapse_flush() does it.
		 * PTE dirty? Shmem page is already dirty; file is read-only.
		 */
		ptep_clear(mm, addr, pte);
		folio_remove_rmap_pte(folio, page, vma);
		nr_ptes++;
	}

	pte_unmap(start_pte);
	if (!pml)
		spin_unlock(ptl);

	/* step 3: set proper refcount and mm_counters. */
	if (nr_ptes) {
		folio_ref_sub(folio, nr_ptes);
		add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
	}

	/* step 4: remove empty page table */
	if (!pml) {
		pml = pmd_lock(mm, pmd);
		if (ptl != pml)
			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
	}
	pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
	pmdp_get_lockless_sync();
	if (ptl != pml)
		spin_unlock(ptl);
	spin_unlock(pml);

	mmu_notifier_invalidate_range_end(&range);

	mm_dec_nr_ptes(mm);
	page_table_check_pte_clear_range(mm, haddr, pgt_pmd);
	pte_free_defer(mm, pmd_pgtable(pgt_pmd));

maybe_install_pmd:
	/* step 5: install pmd entry */
	result = install_pmd
			? set_huge_pmd(vma, haddr, pmd, &folio->page)
			: SCAN_SUCCEED;
	goto drop_folio;
abort:
	if (nr_ptes) {
		flush_tlb_mm(mm);
		folio_ref_sub(folio, nr_ptes);
		add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
	}
	if (start_pte)
		pte_unmap_unlock(start_pte, ptl);
	if (pml && pml != ptl)
		spin_unlock(pml);
	if (notified)
		mmu_notifier_invalidate_range_end(&range);
drop_folio:
	folio_unlock(folio);
	folio_put(folio);
	return result;
}
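
/*
 * Editorial note (summary of the code below): retract_page_tables() walks all
 * VMAs that map @pgoff of @mapping and, for each candidate that is suitably
 * aligned, not written to (no anon_vma) and not uffd-wp registered, flushes
 * the now-empty PTE page table so a later fault can map the hugepage by PMD.
 * The page table is only freed (via pte_free_defer()) when the re-checks made
 * under the page table locks still pass.
 */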
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
	struct vm_area_struct *vma;

	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
		struct mmu_notifier_range range;
		struct mm_struct *mm;
		unsigned long addr;
		pmd_t *pmd, pgt_pmd;
		spinlock_t *pml;
		spinlock_t *ptl;
		bool skipped_uffd = false;

		/*
		 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
		 * got written to. These VMAs are likely not worth removing
		 * page tables from, as PMD-mapping is likely to be split later.
		 */
		if (READ_ONCE(vma->anon_vma))
			continue;

		addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
		if (addr & ~HPAGE_PMD_MASK ||
		    vma->vm_end < addr + HPAGE_PMD_SIZE)
			continue;

		mm = vma->vm_mm;
		if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
			continue;

		if (hpage_collapse_test_exit(mm))
			continue;
		/*
		 * When a vma is registered with uffd-wp, we cannot recycle
		 * the page table because there may be pte markers installed.
		 * Other vmas can still have the same file mapped hugely, but
		 * skip this one: it will always be mapped in small page size
		 * for uffd-wp registered ranges.
		 */
		if (userfaultfd_wp(vma))
			continue;

		/* PTEs were notified when unmapped; but now for the PMD? */
		mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
					addr, addr + HPAGE_PMD_SIZE);
		mmu_notifier_invalidate_range_start(&range);

		pml = pmd_lock(mm, pmd);
		ptl = pte_lockptr(mm, pmd);
		if (ptl != pml)
			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

		/*
		 * Huge page lock is still held, so normally the page table
		 * must remain empty; and we have already skipped anon_vma
		 * and userfaultfd_wp() vmas. But since the mmap_lock is not
		 * held, it is still possible for a racing userfaultfd_ioctl()
		 * to have inserted ptes or markers. Now that we hold ptlock,
		 * repeating the anon_vma check protects from one category,
		 * and repeating the userfaultfd_wp() check from another.
		 */
		if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
			skipped_uffd = true;
		} else {
			pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
			pmdp_get_lockless_sync();
		}

		if (ptl != pml)
			spin_unlock(ptl);
		spin_unlock(pml);

		mmu_notifier_invalidate_range_end(&range);

		if (!skipped_uffd) {
			mm_dec_nr_ptes(mm);
			page_table_check_pte_clear_range(mm, addr, pgt_pmd);
			pte_free_defer(mm, pmd_pgtable(pgt_pmd));
		}
	}
	i_mmap_unlock_read(mapping);
}

/**
 * collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
 *
 * @mm: process address space where collapse happens
 * @addr: virtual collapse start address
 * @file: file that collapse on
 * @start: collapse start address
 * @cc: collapse context and scratchpad
 *
 * Basic scheme is simple, details are more complex:
 *  - allocate and lock a new huge page;
 *  - scan page cache, locking old pages
 *    + swap/gup in pages if necessary;
 *  - copy data to new page
 *  - handle shmem holes
 *    + re-validate that holes weren't filled by someone else
 *    + check for userfaultfd
 *  - finalize updates to the page cache;
 *  - if replacing succeeds:
 *    + unlock huge page;
 *    + free old pages;
 *  - if replacing failed:
 *    + unlock old pages
 *    + unlock and free huge page;
 */
static int collapse_file(struct mm_struct *mm, unsigned long addr,
			 struct file *file, pgoff_t start,
			 struct collapse_control *cc)
{
	struct address_space *mapping = file->f_mapping;
	struct page *dst;
	struct folio *folio, *tmp, *new_folio;
	pgoff_t index = 0, end = start + HPAGE_PMD_NR;
	LIST_HEAD(pagelist);
	XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
	int nr_none = 0, result = SCAN_SUCCEED;
	bool is_shmem = shmem_file(file);

	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));

	result = alloc_charge_folio(&new_folio, mm, cc);
	if (result != SCAN_SUCCEED)
		goto out;

	__folio_set_locked(new_folio);
	if (is_shmem)
		__folio_set_swapbacked(new_folio);
	new_folio->index = start;
	new_folio->mapping = mapping;

	/*
	 * Ensure we have slots for all the pages in the range. This is
	 * almost certainly a no-op because most of the pages must be present.
	 */
	do {
		xas_lock_irq(&xas);
		xas_create_range(&xas);
		if (!xas_error(&xas))
			break;
		xas_unlock_irq(&xas);
		if (!xas_nomem(&xas, GFP_KERNEL)) {
			result = SCAN_FAIL;
			goto rollback;
		}
	} while (1);

	for (index = start; index < end;) {
		xas_set(&xas, index);
		folio = xas_load(&xas);

		VM_BUG_ON(index != xas.xa_index);
		if (is_shmem) {
			if (!folio) {
				/*
				 * Stop if extent has been truncated or
				 * hole-punched, and is now completely
				 * empty.
				 */
				if (index == start) {
					if (!xas_next_entry(&xas, end - 1)) {
						result = SCAN_TRUNCATED;
						goto xa_locked;
					}
				}
				nr_none++;
				index++;
				continue;
			}

			if (xa_is_value(folio) || !folio_test_uptodate(folio)) {
				xas_unlock_irq(&xas);
				/* swap in or instantiate fallocated page */
				if (shmem_get_folio(mapping->host, index, 0,
						    &folio, SGP_NOALLOC)) {
					result = SCAN_FAIL;
					goto xa_unlocked;
				}
				/* drain lru cache to help folio_isolate_lru() */
				lru_add_drain();
			} else if (folio_trylock(folio)) {
				folio_get(folio);
				xas_unlock_irq(&xas);
			} else {
				result = SCAN_PAGE_LOCK;
				goto xa_locked;
			}
		} else {	/* !is_shmem */
			if (!folio || xa_is_value(folio)) {
				xas_unlock_irq(&xas);
				page_cache_sync_readahead(mapping, &file->f_ra,
							  file, index,
							  end - index);
				/* drain lru cache to help folio_isolate_lru() */
				lru_add_drain();
				folio = filemap_lock_folio(mapping, index);
				if (IS_ERR(folio)) {
					result = SCAN_FAIL;
					goto xa_unlocked;
				}
			} else if (folio_test_dirty(folio)) {
				/*
				 * khugepaged only works on read-only fd,
				 * so this page is dirty because it hasn't
				 * been flushed since first write. There
				 * won't be new dirty pages.
				 *
				 * Trigger async flush here and hope the
				 * writeback is done when khugepaged
				 * revisits this page.
				 *
				 * This is a one-off situation. We are not
				 * forcing writeback in loop.
				 */
				xas_unlock_irq(&xas);
				filemap_flush(mapping);
				result = SCAN_FAIL;
				goto xa_unlocked;
			} else if (folio_test_writeback(folio)) {
				xas_unlock_irq(&xas);
				result = SCAN_FAIL;
				goto xa_unlocked;
			} else if (folio_trylock(folio)) {
				folio_get(folio);
				xas_unlock_irq(&xas);
			} else {
				result = SCAN_PAGE_LOCK;
				goto xa_locked;
			}
		}

		/*
		 * The folio must be locked, so we can drop the i_pages lock
		 * without racing with truncate.
		 */
		VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

		/* make sure the folio is up to date */
		if (unlikely(!folio_test_uptodate(folio))) {
			result = SCAN_FAIL;
			goto out_unlock;
		}

		/*
		 * If file was truncated then extended, or hole-punched, before
		 * we locked the first folio, then a THP might be there already.
		 * This will be discovered on the first iteration.
		 */
		if (folio_order(folio) == HPAGE_PMD_ORDER &&
		    folio->index == start) {
			/* Maybe PMD-mapped */
			result = SCAN_PTE_MAPPED_HUGEPAGE;
			goto out_unlock;
		}

		if (folio_mapping(folio) != mapping) {
			result = SCAN_TRUNCATED;
			goto out_unlock;
		}

		if (!is_shmem && (folio_test_dirty(folio) ||
				  folio_test_writeback(folio))) {
			/*
			 * khugepaged only works on read-only fd, so this
			 * folio is dirty because it hasn't been flushed
			 * since first write.
			 */
			result = SCAN_FAIL;
			goto out_unlock;
		}

		if (!folio_isolate_lru(folio)) {
			result = SCAN_DEL_PAGE_LRU;
			goto out_unlock;
		}

		if (!filemap_release_folio(folio, GFP_KERNEL)) {
			result = SCAN_PAGE_HAS_PRIVATE;
			folio_putback_lru(folio);
			goto out_unlock;
		}

		if (folio_mapped(folio))
			try_to_unmap(folio,
				     TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);

		xas_lock_irq(&xas);

		VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio);

		/*
		 * We control 2 + nr_pages references to the folio:
		 *  - we hold a pin on it;
		 *  - nr_pages reference from page cache;
		 *  - one from lru_isolate_folio;
		 * If those are the only references, then any new usage
		 * of the folio will have to fetch it from the page
		 * cache. That requires locking the folio to handle
		 * truncate, so any new usage will be blocked until we
		 * unlock folio after collapse/during rollback.
		 */
		if (folio_ref_count(folio) != 2 + folio_nr_pages(folio)) {
			result = SCAN_PAGE_COUNT;
			xas_unlock_irq(&xas);
			folio_putback_lru(folio);
			goto out_unlock;
		}

		/*
		 * Accumulate the folios that are being collapsed.
		 */
		list_add_tail(&folio->lru, &pagelist);
		index += folio_nr_pages(folio);
		continue;
out_unlock:
		folio_unlock(folio);
		folio_put(folio);
		goto xa_unlocked;
	}

	if (!is_shmem) {
		filemap_nr_thps_inc(mapping);
		/*
		 * Paired with the fence in do_dentry_open() -> get_write_access()
		 * to ensure i_writecount is up to date and the update to nr_thps
		 * is visible. Ensures the page cache will be truncated if the
		 * file is opened writable.
		 */
		smp_mb();
		if (inode_is_open_for_write(mapping->host)) {
			result = SCAN_FAIL;
			filemap_nr_thps_dec(mapping);
		}
	}

xa_locked:
	xas_unlock_irq(&xas);
xa_unlocked:

	/*
	 * If collapse is successful, flush must be done now before copying.
	 * If collapse is unsuccessful, does flush actually need to be done?
	 * Do it anyway, to clear the state.
	 */
	try_to_unmap_flush();

	if (result == SCAN_SUCCEED && nr_none &&
	    !shmem_charge(mapping->host, nr_none))
		result = SCAN_FAIL;
	if (result != SCAN_SUCCEED) {
		nr_none = 0;
		goto rollback;
	}

	/*
	 * The old folios are locked, so they won't change anymore.
	 */
	index = start;
	dst = folio_page(new_folio, 0);
	list_for_each_entry(folio, &pagelist, lru) {
		int i, nr_pages = folio_nr_pages(folio);

		while (index < folio->index) {
			clear_highpage(dst);
			index++;
			dst++;
		}

		for (i = 0; i < nr_pages; i++) {
			if (copy_mc_highpage(dst, folio_page(folio, i)) > 0) {
				result = SCAN_COPY_MC;
				goto rollback;
			}
			index++;
			dst++;
		}
	}
	while (index < end) {
		clear_highpage(dst);
		index++;
		dst++;
	}

	if (nr_none) {
		struct vm_area_struct *vma;
		int nr_none_check = 0;

		i_mmap_lock_read(mapping);
		xas_lock_irq(&xas);

		xas_set(&xas, start);
		for (index = start; index < end; index++) {
			if (!xas_next(&xas)) {
				xas_store(&xas, XA_RETRY_ENTRY);
				if (xas_error(&xas)) {
					result = SCAN_STORE_FAILED;
					goto immap_locked;
				}
				nr_none_check++;
			}
		}

		if (nr_none != nr_none_check) {
			result = SCAN_PAGE_FILLED;
			goto immap_locked;
		}

		/*
		 * If userspace observed a missing page in a VMA with
		 * a MODE_MISSING userfaultfd, then it might expect a
		 * UFFD_EVENT_PAGEFAULT for that page. If so, we need to
		 * roll back to avoid suppressing such an event. wp/minor
		 * userfaultfds don't give userspace any guarantees that
		 * the kernel won't fill a missing page with a zero page,
		 * so they don't matter here.
		 *
		 * Any userfaultfds registered after this point will
		 * not be able to observe any missing pages due to the
		 * previously inserted retry entries.
		 */
		vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
			if (userfaultfd_missing(vma)) {
				result = SCAN_EXCEED_NONE_PTE;
				goto immap_locked;
			}
		}

immap_locked:
		i_mmap_unlock_read(mapping);
		if (result != SCAN_SUCCEED) {
			xas_set(&xas, start);
			for (index = start; index < end; index++) {
				if (xas_next(&xas) == XA_RETRY_ENTRY)
					xas_store(&xas, NULL);
			}

			xas_unlock_irq(&xas);
			goto rollback;
		}
	} else {
		xas_lock_irq(&xas);
	}

	if (is_shmem)
		__lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
	else
		__lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);

	if (nr_none) {
		__lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
		/* nr_none is always 0 for non-shmem. */
		__lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
	}

	/*
	 * Mark new_folio as uptodate before inserting it into the
	 * page cache so that it isn't mistaken for a fallocated but
	 * unwritten page.
	 */
	folio_mark_uptodate(new_folio);
	folio_ref_add(new_folio, HPAGE_PMD_NR - 1);

	if (is_shmem)
		folio_mark_dirty(new_folio);
	folio_add_lru(new_folio);

	/* Join all the small entries into a single multi-index entry. */
	xas_set_order(&xas, start, HPAGE_PMD_ORDER);
	xas_store(&xas, new_folio);
	WARN_ON_ONCE(xas_error(&xas));
	xas_unlock_irq(&xas);

	/*
	 * Remove pte page tables, so we can re-fault the page as huge.
	 * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp().
	 */
	retract_page_tables(mapping, start);
	if (cc && !cc->is_khugepaged)
		result = SCAN_PTE_MAPPED_HUGEPAGE;
	folio_unlock(new_folio);

	/*
	 * The collapse has succeeded, so free the old folios.
	 */
	list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
		list_del(&folio->lru);
		folio->mapping = NULL;
		folio_clear_active(folio);
		folio_clear_unevictable(folio);
		folio_unlock(folio);
		folio_put_refs(folio, 2 + folio_nr_pages(folio));
	}

	goto out;

rollback:
	/* Something went wrong: roll back page cache changes */
	if (nr_none) {
		xas_lock_irq(&xas);
		mapping->nrpages -= nr_none;
		xas_unlock_irq(&xas);
		shmem_uncharge(mapping->host, nr_none);
	}

	list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
		list_del(&folio->lru);
		folio_unlock(folio);
		folio_putback_lru(folio);
		folio_put(folio);
	}
	/*
	 * Undo the updates of filemap_nr_thps_inc for non-SHMEM
	 * file only. This undo is not needed unless failure is
	 * due to SCAN_COPY_MC.
	 */
	if (!is_shmem && result == SCAN_COPY_MC) {
		filemap_nr_thps_dec(mapping);
		/*
		 * Paired with the fence in do_dentry_open() -> get_write_access()
		 * to ensure the update to nr_thps is visible.
		 */
		smp_mb();
	}

	new_folio->mapping = NULL;

	folio_unlock(new_folio);
	folio_put(new_folio);
out:
	VM_BUG_ON(!list_empty(&pagelist));
	trace_mm_khugepaged_collapse_file(mm, new_folio, index, addr, is_shmem,
					  file, HPAGE_PMD_NR, result);
	return result;
}
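
/*
 * Editorial note (summary of the code below): hpage_collapse_scan_file() is
 * the read-only pass over the page cache: it counts present and swapped-out
 * entries, applies the khugepaged max_ptes_swap/max_ptes_none limits, records
 * node placement in cc->node_load, and only calls collapse_file() when the
 * scan found nothing disqualifying.
 */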
static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
				    struct file *file, pgoff_t start,
				    struct collapse_control *cc)
{
	struct folio *folio = NULL;
	struct address_space *mapping = file->f_mapping;
	XA_STATE(xas, &mapping->i_pages, start);
	int present, swap;
	int node = NUMA_NO_NODE;
	int result = SCAN_SUCCEED;

	present = 0;
	swap = 0;
	memset(cc->node_load, 0, sizeof(cc->node_load));
	nodes_clear(cc->alloc_nmask);
	rcu_read_lock();
	xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) {
		if (xas_retry(&xas, folio))
			continue;

		if (xa_is_value(folio)) {
			swap += 1 << xas_get_order(&xas);
			if (cc->is_khugepaged &&
			    swap > khugepaged_max_ptes_swap) {
				result = SCAN_EXCEED_SWAP_PTE;
				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
				break;
			}
			continue;
		}

		if (folio_order(folio) == HPAGE_PMD_ORDER &&
		    folio->index == start) {
			/* Maybe PMD-mapped */
			result = SCAN_PTE_MAPPED_HUGEPAGE;
			/*
			 * For SCAN_PTE_MAPPED_HUGEPAGE, further processing
			 * by the caller won't touch the page cache, and so
			 * it's safe to skip LRU and refcount checks before
			 * returning.
			 */
			break;
		}

		node = folio_nid(folio);
		if (hpage_collapse_scan_abort(node, cc)) {
			result = SCAN_SCAN_ABORT;
			break;
		}
		cc->node_load[node]++;

		if (!folio_test_lru(folio)) {
			result = SCAN_PAGE_LRU;
			break;
		}

		if (!is_refcount_suitable(folio)) {
			result = SCAN_PAGE_COUNT;
			break;
		}

		/*
		 * We probably should check if the folio is referenced
		 * here, but nobody would transfer pte_young() to
		 * folio_test_referenced() for us. And rmap walk here
		 * is just too costly...
		 */

		present += folio_nr_pages(folio);

		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();

	if (result == SCAN_SUCCEED) {
		if (cc->is_khugepaged &&
		    present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
			result = SCAN_EXCEED_NONE_PTE;
			count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
		} else {
			result = collapse_file(mm, addr, file, start, cc);
		}
	}

	trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result);
	return result;
}
#else
static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
				    struct file *file, pgoff_t start,
				    struct collapse_control *cc)
{
	BUILD_BUG();
}
#endif
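
/*
 * Editorial note (summary of the code below): khugepaged_scan_mm_slot() scans
 * up to @pages pages of the current mm_slot, resuming at khugepaged_scan.address
 * and advancing in HPAGE_PMD_SIZE steps. It drops khugepaged_mm_lock while
 * scanning, only trylocks mmap_lock, and hands the slot back (or releases it
 * via collect_mm_slot()) before returning the amount of progress made.
 */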
static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
					    struct collapse_control *cc)
	__releases(&khugepaged_mm_lock)
	__acquires(&khugepaged_mm_lock)
{
	struct vma_iterator vmi;
	struct khugepaged_mm_slot *mm_slot;
	struct mm_slot *slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int progress = 0;

	VM_BUG_ON(!pages);
	lockdep_assert_held(&khugepaged_mm_lock);
	*result = SCAN_FAIL;

	if (khugepaged_scan.mm_slot) {
		mm_slot = khugepaged_scan.mm_slot;
		slot = &mm_slot->slot;
	} else {
		slot = list_entry(khugepaged_scan.mm_head.next,
				  struct mm_slot, mm_node);
		mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
		khugepaged_scan.address = 0;
		khugepaged_scan.mm_slot = mm_slot;
	}
	spin_unlock(&khugepaged_mm_lock);

	mm = slot->mm;
	/*
	 * Don't wait for semaphore (to avoid long wait times). Just move to
	 * the next mm on the list.
	 */
	vma = NULL;
	if (unlikely(!mmap_read_trylock(mm)))
		goto breakouterloop_mmap_lock;

	progress++;
	if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
		goto breakouterloop;

	vma_iter_init(&vmi, mm, khugepaged_scan.address);
	for_each_vma(vmi, vma) {
		unsigned long hstart, hend;

		cond_resched();
		if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
			progress++;
			break;
		}
		if (!thp_vma_allowable_order(vma, vma->vm_flags,
					     TVA_ENFORCE_SYSFS, PMD_ORDER)) {
skip:
			progress++;
			continue;
		}
		hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
		hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
		if (khugepaged_scan.address > hend)
			goto skip;
		if (khugepaged_scan.address < hstart)
			khugepaged_scan.address = hstart;
		VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);

		while (khugepaged_scan.address < hend) {
			bool mmap_locked = true;

			cond_resched();
			if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
				goto breakouterloop;

			VM_BUG_ON(khugepaged_scan.address < hstart ||
				  khugepaged_scan.address + HPAGE_PMD_SIZE >
				  hend);
			if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
				struct file *file = get_file(vma->vm_file);
				pgoff_t pgoff = linear_page_index(vma,
						khugepaged_scan.address);

				mmap_read_unlock(mm);
				mmap_locked = false;
				*result = hpage_collapse_scan_file(mm,
					khugepaged_scan.address, file, pgoff, cc);
				fput(file);
				if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
					mmap_read_lock(mm);
					if (hpage_collapse_test_exit_or_disable(mm))
						goto breakouterloop;
					*result = collapse_pte_mapped_thp(mm,
						khugepaged_scan.address, false);
					if (*result == SCAN_PMD_MAPPED)
						*result = SCAN_SUCCEED;
					mmap_read_unlock(mm);
				}
			} else {
				*result = hpage_collapse_scan_pmd(mm, vma,
					khugepaged_scan.address, &mmap_locked, cc);
			}

			if (*result == SCAN_SUCCEED)
				++khugepaged_pages_collapsed;

			/* move to next address */
			khugepaged_scan.address += HPAGE_PMD_SIZE;
			progress += HPAGE_PMD_NR;
			if (!mmap_locked)
				/*
				 * We released mmap_lock so break loop. Note
				 * that we drop mmap_lock before all hugepage
				 * allocations, so if allocation fails, we are
				 * guaranteed to break here and report the
				 * correct result back to caller.
				 */
				goto breakouterloop_mmap_lock;
			if (progress >= pages)
				goto breakouterloop;
		}
	}
breakouterloop:
	mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
breakouterloop_mmap_lock:

	spin_lock(&khugepaged_mm_lock);
	VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
	/*
	 * Release the current mm_slot if this mm is about to die, or
	 * if we scanned all vmas of this mm.
	 */
	if (hpage_collapse_test_exit(mm) || !vma) {
		/*
		 * Make sure that if mm_users is reaching zero while
		 * khugepaged runs here, khugepaged_exit will find
		 * mm_slot not pointing to the exiting mm.
		 */
		if (slot->mm_node.next != &khugepaged_scan.mm_head) {
			slot = list_entry(slot->mm_node.next,
					  struct mm_slot, mm_node);
			khugepaged_scan.mm_slot =
				mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
			khugepaged_scan.address = 0;
		} else {
			khugepaged_scan.mm_slot = NULL;
			khugepaged_full_scans++;
		}

		collect_mm_slot(mm_slot);
	}

	return progress;
}

static int khugepaged_has_work(void)
{
	return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled();
}

static int khugepaged_wait_event(void)
{
	return !list_empty(&khugepaged_scan.mm_head) ||
		kthread_should_stop();
}
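
/*
 * Editorial note (summary of the code below): khugepaged_do_scan() drives one
 * scan pass of khugepaged_pages_to_scan pages, lets the mm list be walked at
 * most twice per pass, and backs off with khugepaged_alloc_sleep() once after
 * a hugepage allocation failure before giving up on the pass.
 */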
static void khugepaged_do_scan(struct collapse_control *cc)
{
	unsigned int progress = 0, pass_through_head = 0;
	unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
	bool wait = true;
	int result = SCAN_SUCCEED;

	lru_add_drain_all();

	while (true) {
		cond_resched();

		if (unlikely(kthread_should_stop()))
			break;

		spin_lock(&khugepaged_mm_lock);
		if (!khugepaged_scan.mm_slot)
			pass_through_head++;
		if (khugepaged_has_work() &&
		    pass_through_head < 2)
			progress += khugepaged_scan_mm_slot(pages - progress,
							    &result, cc);
		else
			progress = pages;
		spin_unlock(&khugepaged_mm_lock);

		if (progress >= pages)
			break;

		if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) {
			/*
			 * If fail to allocate the first time, try to sleep for
			 * a while. When hit again, cancel the scan.
			 */
			if (!wait)
				break;
			wait = false;
			khugepaged_alloc_sleep();
		}
	}
}

static bool khugepaged_should_wakeup(void)
{
	return kthread_should_stop() ||
	       time_after_eq(jiffies, khugepaged_sleep_expire);
}

static void khugepaged_wait_work(void)
{
	if (khugepaged_has_work()) {
		const unsigned long scan_sleep_jiffies =
			msecs_to_jiffies(khugepaged_scan_sleep_millisecs);

		if (!scan_sleep_jiffies)
			return;

		khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
		wait_event_freezable_timeout(khugepaged_wait,
					     khugepaged_should_wakeup(),
					     scan_sleep_jiffies);
		return;
	}

	if (hugepage_pmd_enabled())
		wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
}
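
/*
 * Editorial note (summary of the code below): khugepaged() is the main loop of
 * the kthread started by start_stop_khugepaged(): scan, then sleep in
 * khugepaged_wait_work(), until kthread_should_stop(); on exit it detaches any
 * mm_slot still referenced by khugepaged_scan.
 */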
static int khugepaged(void *none)
{
	struct khugepaged_mm_slot *mm_slot;

	set_freezable();
	set_user_nice(current, MAX_NICE);

	while (!kthread_should_stop()) {
		khugepaged_do_scan(&khugepaged_collapse_control);
		khugepaged_wait_work();
	}

	spin_lock(&khugepaged_mm_lock);
	mm_slot = khugepaged_scan.mm_slot;
	khugepaged_scan.mm_slot = NULL;
	if (mm_slot)
		collect_mm_slot(mm_slot);
	spin_unlock(&khugepaged_mm_lock);
	return 0;
}
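
/*
 * Editorial note (summary of the code below): with PMD-sized THP enabled,
 * set_recommended_min_free_kbytes() raises min_free_kbytes so that a couple of
 * pageblocks per eligible zone stay free for fragmentation avoidance, capped
 * at 5% of lowmem; otherwise it falls back to calculate_min_free_kbytes().
 * Either way the zone watermarks are recomputed.
 */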
static void set_recommended_min_free_kbytes(void)
{
	struct zone *zone;
	int nr_zones = 0;
	unsigned long recommended_min;

	if (!hugepage_pmd_enabled()) {
		calculate_min_free_kbytes();
		goto update_wmarks;
	}

	for_each_populated_zone(zone) {
		/*
		 * We don't need to worry about fragmentation of
		 * ZONE_MOVABLE since it only has movable pages.
		 */
		if (zone_idx(zone) > gfp_zone(GFP_USER))
			continue;

		nr_zones++;
	}

	/* Ensure 2 pageblocks are free to assist fragmentation avoidance */
	recommended_min = pageblock_nr_pages * nr_zones * 2;

	/*
	 * Make sure that on average at least two pageblocks are almost free
	 * of another type, one for a migratetype to fall back to and a
	 * second to avoid subsequent fallbacks of other types. There are 3
	 * MIGRATE_TYPES we care about.
	 */
	recommended_min += pageblock_nr_pages * nr_zones *
			   MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;

	/* don't ever allow to reserve more than 5% of the lowmem */
	recommended_min = min(recommended_min,
			      (unsigned long) nr_free_buffer_pages() / 20);
	recommended_min <<= (PAGE_SHIFT-10);

	if (recommended_min > min_free_kbytes) {
		if (user_min_free_kbytes >= 0)
			pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
				min_free_kbytes, recommended_min);

		min_free_kbytes = recommended_min;
	}

update_wmarks:
	setup_per_zone_wmarks();
}
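
/*
 * Editorial note (summary of the code below): start_stop_khugepaged() starts
 * the "khugepaged" kthread when PMD-sized THP is enabled and stops it when it
 * is not, under khugepaged_mutex, and refreshes min_free_kbytes accordingly.
 */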
int start_stop_khugepaged(void)
{
	int err = 0;

	mutex_lock(&khugepaged_mutex);
	if (hugepage_pmd_enabled()) {
		if (!khugepaged_thread)
			khugepaged_thread = kthread_run(khugepaged, NULL,
							"khugepaged");
		if (IS_ERR(khugepaged_thread)) {
			pr_err("khugepaged: kthread_run(khugepaged) failed\n");
			err = PTR_ERR(khugepaged_thread);
			khugepaged_thread = NULL;
			goto fail;
		}

		if (!list_empty(&khugepaged_scan.mm_head))
			wake_up_interruptible(&khugepaged_wait);
	} else if (khugepaged_thread) {
		kthread_stop(khugepaged_thread);
		khugepaged_thread = NULL;
	}
	set_recommended_min_free_kbytes();
fail:
	mutex_unlock(&khugepaged_mutex);
	return err;
}

void khugepaged_min_free_kbytes_update(void)
{
	mutex_lock(&khugepaged_mutex);
	if (hugepage_pmd_enabled() && khugepaged_thread)
		set_recommended_min_free_kbytes();
	mutex_unlock(&khugepaged_mutex);
}

bool current_is_khugepaged(void)
{
	return kthread_func(current) == khugepaged;
}

static int madvise_collapse_errno(enum scan_result r)
{
	/*
	 * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide
	 * actionable feedback to caller, so they may take an appropriate
	 * fallback measure depending on the nature of the failure.
	 */
	switch (r) {
	case SCAN_ALLOC_HUGE_PAGE_FAIL:
		return -ENOMEM;
	case SCAN_CGROUP_CHARGE_FAIL:
	case SCAN_EXCEED_NONE_PTE:
		return -EBUSY;
	/* Resource temporarily unavailable - trying again might succeed */
	case SCAN_PAGE_COUNT:
	case SCAN_PAGE_LOCK:
	case SCAN_PAGE_LRU:
	case SCAN_DEL_PAGE_LRU:
	case SCAN_PAGE_FILLED:
		return -EAGAIN;
	/*
	 * Other: Trying again likely not to succeed / error intrinsic to
	 * specified memory range. khugepaged likely won't be able to collapse
	 * either.
	 */
	default:
		return -EINVAL;
	}
}
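
/*
 * Editorial note: madvise_collapse() below backs MADV_COLLAPSE. A minimal,
 * hypothetical userspace sketch of how this path is reached (error handling
 * and the 2MiB hugepage size are assumptions for illustration only):
 *
 *	size_t len = 2UL << 20;			// assume 2MiB PMD size
 *	void *buf = aligned_alloc(2UL << 20, len);
 *	memset(buf, 0, len);			// populate the range first
 *	if (madvise(buf, len, MADV_COLLAPSE))
 *		perror("MADV_COLLAPSE");	// errno mapped by madvise_collapse_errno()
 *
 * Unlike the khugepaged scan path, this path does not pass TVA_ENFORCE_SYSFS
 * to thp_vma_allowable_order(), and it reports per-range success or a
 * best-effort errno back to the caller.
 */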
int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
		     unsigned long start, unsigned long end)
{
	struct collapse_control *cc;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long hstart, hend, addr;
	int thps = 0, last_fail = SCAN_FAIL;
	bool mmap_locked = true;

	BUG_ON(vma->vm_start > start);
	BUG_ON(vma->vm_end < end);

	*prev = vma;

	if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
		return -EINVAL;

	cc = kmalloc(sizeof(*cc), GFP_KERNEL);
	if (!cc)
		return -ENOMEM;
	cc->is_khugepaged = false;

	mmgrab(mm);
	lru_add_drain_all();

	hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	hend = end & HPAGE_PMD_MASK;

	for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
		int result = SCAN_FAIL;

		if (!mmap_locked) {
			cond_resched();
			mmap_read_lock(mm);
			mmap_locked = true;
			result = hugepage_vma_revalidate(mm, addr, false, &vma,
							 cc);
			if (result != SCAN_SUCCEED) {
				last_fail = result;
				goto out_nolock;
			}

			hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
		}
		mmap_assert_locked(mm);
		memset(cc->node_load, 0, sizeof(cc->node_load));
		nodes_clear(cc->alloc_nmask);
		if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
			struct file *file = get_file(vma->vm_file);
			pgoff_t pgoff = linear_page_index(vma, addr);

			mmap_read_unlock(mm);
			mmap_locked = false;
			result = hpage_collapse_scan_file(mm, addr, file, pgoff,
							  cc);
			fput(file);
		} else {
			result = hpage_collapse_scan_pmd(mm, vma, addr,
							 &mmap_locked, cc);
		}
		if (!mmap_locked)
			*prev = NULL;  /* Tell caller we dropped mmap_lock */

handle_result:
		switch (result) {
		case SCAN_SUCCEED:
		case SCAN_PMD_MAPPED:
			++thps;
			break;
		case SCAN_PTE_MAPPED_HUGEPAGE:
			BUG_ON(mmap_locked);
			BUG_ON(*prev);
			mmap_read_lock(mm);
			result = collapse_pte_mapped_thp(mm, addr, true);
			mmap_read_unlock(mm);
			goto handle_result;
		/* Whitelisted set of results where continuing OK */
		case SCAN_PMD_NULL:
		case SCAN_PTE_NON_PRESENT:
		case SCAN_PTE_UFFD_WP:
		case SCAN_PAGE_RO:
		case SCAN_LACK_REFERENCED_PAGE:
		case SCAN_PAGE_NULL:
		case SCAN_PAGE_COUNT:
		case SCAN_PAGE_LOCK:
		case SCAN_PAGE_COMPOUND:
		case SCAN_PAGE_LRU:
		case SCAN_DEL_PAGE_LRU:
			last_fail = result;
			break;
		default:
			last_fail = result;
			/* Other error, exit */
			goto out_maybelock;
		}
	}

out_maybelock:
	/* Caller expects us to hold mmap_lock on return */
	if (!mmap_locked)
		mmap_read_lock(mm);
out_nolock:
	mmap_assert_locked(mm);
	mmdrop(mm);
	kfree(cc);

	return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
			: madvise_collapse_errno(last_fail);
}