// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2008, 2009 Intel Corporation
 * Authors: Andi Kleen, Fengguang Wu
 *
 * High level machine check handler. Handles pages reported by the
 * hardware as being corrupted usually due to a multi-bit ECC memory or cache
 * failure.
 *
 * In addition there is a "soft offline" entry point that allows us to stop
 * using not-yet-corrupted but suspicious pages without killing anything.
 *
 * Handles page cache pages in various states. The tricky part
 * here is that we can access any page asynchronously with respect to
 * other VM users, because memory failures could happen anytime and
 * anywhere. This could violate some of their assumptions. This is why
 * this code has to be extremely careful. Generally it tries to use
 * normal locking rules, as in get the standard locks, even if that means
 * the error handling takes potentially a long time.
 *
 * It can be very tempting to add handling for obscure cases here.
 * In general any code for handling new cases should only be added iff:
 * - You know how to test it.
 * - You have a test that can be added to mce-test
 *   https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
 * - The case actually shows up as a frequent (top 10) page state in
 *   tools/mm/page-types when running a real workload.
 *
 * There are several operations here with exponential complexity because
 * of unsuitable VM data structures. For example the operation to map back
 * from RMAP chains to processes has to walk the complete process list and
 * has non-linear complexity in the number of processes. But since memory
 * corruptions are rare we hope to get away with this. This avoids impacting
 * the core VM.
 */

#define pr_fmt(fmt) "Memory failure: " fmt

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/dax.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/memremap.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
#include <linux/sysctl.h>
#include "swap.h"
#include "internal.h"
#include "ras/ras_event.h"

static int sysctl_memory_failure_early_kill __read_mostly;

static int sysctl_memory_failure_recovery __read_mostly = 1;

static int sysctl_enable_soft_offline __read_mostly = 1;

atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);

static bool hw_memory_failure __read_mostly = false;

static DEFINE_MUTEX(mf_mutex);

void num_poisoned_pages_inc(unsigned long pfn)
{
	atomic_long_inc(&num_poisoned_pages);
	memblk_nr_poison_inc(pfn);
}

void num_poisoned_pages_sub(unsigned long pfn, long i)
{
	atomic_long_sub(i, &num_poisoned_pages);
	if (pfn != -1UL)
		memblk_nr_poison_sub(pfn, i);
}

/**
 * MF_ATTR_RO - Create a sysfs entry for each memory failure statistic.
 * @_name: name of the file in the per NUMA node sysfs directory.
 */
#define MF_ATTR_RO(_name)					\
static ssize_t _name##_show(struct device *dev,			\
			    struct device_attribute *attr,	\
			    char *buf)				\
{								\
	struct memory_failure_stats *mf_stats =			\
		&NODE_DATA(dev->id)->mf_stats;			\
	return sprintf(buf, "%lu\n", mf_stats->_name);		\
}								\
static DEVICE_ATTR_RO(_name)

MF_ATTR_RO(total);
MF_ATTR_RO(ignored);
MF_ATTR_RO(failed);
MF_ATTR_RO(delayed);
MF_ATTR_RO(recovered);

static struct attribute *memory_failure_attr[] = {
	&dev_attr_total.attr,
	&dev_attr_ignored.attr,
	&dev_attr_failed.attr,
	&dev_attr_delayed.attr,
	&dev_attr_recovered.attr,
	NULL,
};

const struct attribute_group memory_failure_attr_group = {
	.name = "memory_failure",
	.attrs = memory_failure_attr,
};

static struct ctl_table memory_failure_table[] = {
	{
		.procname = "memory_failure_early_kill",
		.data = &sysctl_memory_failure_early_kill,
		.maxlen = sizeof(sysctl_memory_failure_early_kill),
		.mode = 0644,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_ONE,
	},
	{
		.procname = "memory_failure_recovery",
		.data = &sysctl_memory_failure_recovery,
		.maxlen = sizeof(sysctl_memory_failure_recovery),
		.mode = 0644,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_ONE,
	},
	{
		.procname = "enable_soft_offline",
		.data = &sysctl_enable_soft_offline,
		.maxlen = sizeof(sysctl_enable_soft_offline),
		.mode = 0644,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_ONE,
	}
};

/*
 * Return values:
 *   1: the page is dissolved (if needed) and taken off from buddy,
 *   0: the page is dissolved (if needed) and not taken off from buddy,
 * < 0: failed to dissolve.
 */
static int __page_handle_poison(struct page *page)
{
	int ret;

	/*
	 * zone_pcp_disable() can't be used here. It will
	 * hold pcp_batch_high_lock and dissolve_free_hugetlb_folio() might hold
	 * cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap
	 * optimization is enabled. This would break the current lock dependency
	 * chain and lead to deadlock.
	 * Disabling pcp before dissolving the page was a deterministic
	 * approach because we made sure that those pages cannot end up in any
	 * PCP list. Draining PCP lists expels those pages to the buddy system,
	 * but nothing guarantees that those pages do not get back to a PCP
	 * queue if we need to refill those.
	 */
	ret = dissolve_free_hugetlb_folio(page_folio(page));
	if (!ret) {
		drain_all_pages(page_zone(page));
		ret = take_page_off_buddy(page);
	}

	return ret;
}
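
/*
 * Mark a page about to be soft-offlined as hwpoisoned and account it.
 * For hugetlb or free pages the page is first dissolved/taken off the buddy
 * list via __page_handle_poison(); if that fails we give up without marking,
 * since the page is still usable. Sets PG_hwpoison, drops the caller's
 * reference when @release is true, then pins the page with an extra refcount
 * and bumps the poisoned-page counter.
 */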
static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
{
	if (hugepage_or_freepage) {
		/*
		 * Doing this check for free pages is also fine since
		 * dissolve_free_hugetlb_folio() returns 0 for non-hugetlb folios as well.
		 */
		if (__page_handle_poison(page) <= 0)
			/*
			 * We could fail to take off the target page from buddy
			 * for example due to racy page allocation, but that's
			 * acceptable because the soft-offlined page is not broken
			 * and if someone really wants to use it, they should
			 * take it.
			 */
			return false;
	}

	SetPageHWPoison(page);
	if (release)
		put_page(page);
	page_ref_inc(page);
	num_poisoned_pages_inc(page_to_pfn(page));

	return true;
}

#if IS_ENABLED(CONFIG_HWPOISON_INJECT)

u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
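
/*
 * Filter out pages that do not live on the block device selected through
 * hwpoison_filter_dev_major/minor. Only meaningful for the hwpoison
 * injector; a non-zero return tells hwpoison_filter() to skip the page.
 */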
static int hwpoison_filter_dev(struct page *p)
{
	struct folio *folio = page_folio(p);
	struct address_space *mapping;
	dev_t dev;

	if (hwpoison_filter_dev_major == ~0U &&
	    hwpoison_filter_dev_minor == ~0U)
		return 0;

	mapping = folio_mapping(folio);
	if (mapping == NULL || mapping->host == NULL)
		return -EINVAL;

	dev = mapping->host->i_sb->s_dev;
	if (hwpoison_filter_dev_major != ~0U &&
	    hwpoison_filter_dev_major != MAJOR(dev))
		return -EINVAL;
	if (hwpoison_filter_dev_minor != ~0U &&
	    hwpoison_filter_dev_minor != MINOR(dev))
		return -EINVAL;

	return 0;
}

static int hwpoison_filter_flags(struct page *p)
{
	if (!hwpoison_filter_flags_mask)
		return 0;

	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
				    hwpoison_filter_flags_value)
		return 0;
	else
		return -EINVAL;
}

/*
 * This allows stress tests to limit test scope to a collection of tasks
 * by putting them under some memcg. This prevents killing unrelated/important
 * processes such as /sbin/init. Note that the target task may share clean
 * pages with init (e.g. libc text), which is harmless. If the target task
 * shares _dirty_ pages with another task B, the test scheme must make sure B
 * is also included in the memcg. Lastly, due to race conditions this filter
 * can only guarantee that the page either belongs to the memcg tasks, or is
 * a freed page.
 */
#ifdef CONFIG_MEMCG
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
	if (!hwpoison_filter_memcg)
		return 0;

	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif

int hwpoison_filter(struct page *p)
{
	if (!hwpoison_filter_enable)
		return 0;

	if (hwpoison_filter_dev(p))
		return -EINVAL;

	if (hwpoison_filter_flags(p))
		return -EINVAL;

	if (hwpoison_filter_task(p))
		return -EINVAL;

	return 0;
}
EXPORT_SYMBOL_GPL(hwpoison_filter);
#else
int hwpoison_filter(struct page *p)
{
	return 0;
}
#endif

/*
 * Kill all processes that have a poisoned page mapped and then isolate
 * the page.
 *
 * General strategy:
 * Find all processes having the page mapped and kill them.
 * But we keep a page reference around so that the page is not
 * actually freed yet.
 * Then stash the page away.
 *
 * There's no convenient way to get back to mapped processes
 * from the VMAs. So do a brute-force search over all
 * running processes.
 *
 * Remember that machine checks are not common (or rather
 * if they are common you have other problems), so this shouldn't
 * be a performance issue.
 *
 * Also there are some races possible while we get from the
 * error detection to actually handling it.
 */

struct to_kill {
	struct list_head nd;
	struct task_struct *tsk;
	unsigned long addr;
	short size_shift;
};

/*
 * Send all the processes who have the page mapped a signal.
 * ``action optional'' if they are not immediately affected by the error
 * ``action required'' if error happened in current execution context
 */
static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
{
	struct task_struct *t = tk->tsk;
	short addr_lsb = tk->size_shift;
	int ret = 0;

	pr_err("%#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
	       pfn, t->comm, task_pid_nr(t));

	if ((flags & MF_ACTION_REQUIRED) && (t == current))
		ret = force_sig_mceerr(BUS_MCEERR_AR,
				       (void __user *)tk->addr, addr_lsb);
	else
		/*
		 * Signal other processes sharing the page if they have
		 * PF_MCE_EARLY set.
		 * Don't use force here, it's convenient if the signal
		 * can be temporarily blocked.
		 */
		ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
				      addr_lsb, t);
	if (ret < 0)
		pr_info("Error sending signal to %s:%d: %d\n",
			t->comm, task_pid_nr(t), ret);
	return ret;
}

/*
 * Unknown page type encountered. Try to check whether it can turn into a
 * PageLRU page via lru_add_drain_all().
 */
void shake_folio(struct folio *folio)
{
	if (folio_test_hugetlb(folio))
		return;
	/*
	 * TODO: Could shrink slab caches here if a lightweight range-based
	 * shrinker will be available.
	 */
	if (folio_test_slab(folio))
		return;

	lru_add_drain_all();
}
EXPORT_SYMBOL_GPL(shake_folio);

static void shake_page(struct page *page)
{
	shake_folio(page_folio(page));
}
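
/*
 * Return the mapping size (as a page shift) of the ZONE_DEVICE page mapped
 * at @address in @vma, by walking the page tables: PUD_SHIFT/PMD_SHIFT for
 * devmap huge mappings, PAGE_SHIFT for a present devmap PTE, and 0 when no
 * mapping is found.
 */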
static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
					       unsigned long address)
{
	unsigned long ret = 0;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	pte_t ptent;

	VM_BUG_ON_VMA(address == -EFAULT, vma);
	pgd = pgd_offset(vma->vm_mm, address);
	if (!pgd_present(*pgd))
		return 0;
	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		return 0;
	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		return 0;
	if (pud_devmap(*pud))
		return PUD_SHIFT;
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;
	if (pmd_devmap(*pmd))
		return PMD_SHIFT;
	pte = pte_offset_map(pmd, address);
	if (!pte)
		return 0;
	ptent = ptep_get(pte);
	if (pte_present(ptent) && pte_devmap(ptent))
		ret = PAGE_SHIFT;
	pte_unmap(pte);
	return ret;
}

/*
 * Failure handling: if we can't find or can't kill a process there's
 * not much we can do. We just print a message and ignore otherwise.
 */

/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 */
static void __add_to_kill(struct task_struct *tsk, struct page *p,
			  struct vm_area_struct *vma, struct list_head *to_kill,
			  unsigned long addr)
{
	struct to_kill *tk;

	tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
	if (!tk) {
		pr_err("Out of memory while machine check handling\n");
		return;
	}

	tk->addr = addr;
	if (is_zone_device_page(p))
		tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
	else
		tk->size_shift = page_shift(compound_head(p));

	/*
	 * Send SIGKILL if "tk->addr == -EFAULT". Also, as
	 * "tk->size_shift" is always non-zero for !is_zone_device_page(),
	 * so "tk->size_shift == 0" effectively checks no mapping on
	 * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times
	 * to a process' address space, it's possible not all N VMAs
	 * contain mappings for the page, but at least one VMA does.
	 * Only deliver SIGBUS with payload derived from the VMA that
	 * has a mapping for the page.
	 */
	if (tk->addr == -EFAULT) {
		pr_info("Unable to find user space address %lx in %s\n",
			page_to_pfn(p), tsk->comm);
	} else if (tk->size_shift == 0) {
		kfree(tk);
		return;
	}

	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
}

static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
				  struct vm_area_struct *vma, struct list_head *to_kill,
				  unsigned long addr)
{
	if (addr == -EFAULT)
		return;
	__add_to_kill(tsk, p, vma, to_kill, addr);
}

#ifdef CONFIG_KSM
static bool task_in_to_kill_list(struct list_head *to_kill,
				 struct task_struct *tsk)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe(tk, next, to_kill, nd) {
		if (tk->tsk == tsk)
			return true;
	}

	return false;
}

void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
		     struct vm_area_struct *vma, struct list_head *to_kill,
		     unsigned long addr)
{
	if (!task_in_to_kill_list(to_kill, tsk))
		__add_to_kill(tsk, p, vma, to_kill, addr);
}
#endif

/*
 * Kill the processes that have been collected earlier.
 *
 * Only do anything when FORCEKILL is set, otherwise just free the
 * list (this is used for clean pages which do not need killing)
 */
static void kill_procs(struct list_head *to_kill, int forcekill,
		       unsigned long pfn, int flags)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe(tk, next, to_kill, nd) {
		if (forcekill) {
			if (tk->addr == -EFAULT) {
				pr_err("%#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
				       pfn, tk->tsk->comm, task_pid_nr(tk->tsk));
				do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
						 tk->tsk, PIDTYPE_PID);
			}
			/*
			 * In theory the process could have mapped
			 * something else on the address in-between. We could
			 * check for that, but we need to tell the
			 * process anyways.
			 */
			else if (kill_proc(tk, pfn, flags) < 0)
				pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n",
				       pfn, tk->tsk->comm, task_pid_nr(tk->tsk));
		}
		list_del(&tk->nd);
		put_task_struct(tk->tsk);
		kfree(tk);
	}
}

/*
 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 * on behalf of the thread group. Return task_struct of the (first found)
 * dedicated thread if found, and return NULL otherwise.
 *
 * We already hold rcu lock in the caller, so we don't have to call
 * rcu_read_lock/unlock() in this function.
 */
static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
	struct task_struct *t;

	for_each_thread(tsk, t) {
		if (t->flags & PF_MCE_PROCESS) {
			if (t->flags & PF_MCE_EARLY)
				return t;
		} else {
			if (sysctl_memory_failure_early_kill)
				return t;
		}
	}
	return NULL;
}

/*
 * Determine whether a given process is an "early kill" process, i.e. one
 * that expects to be signaled as soon as some page under the process is
 * hwpoisoned. Return the task_struct of the dedicated thread (the main
 * thread unless explicitly specified) if the process is "early kill",
 * otherwise return NULL.
 *
 * Note that the above is true for the Action Optional case. For the Action
 * Required case, the error is only meaningful to the current thread, which
 * needs to be signaled with SIGBUS; for other non-current processes sharing
 * the same error page the error is Action Optional, and if such a process is
 * "early kill" the task_struct of its dedicated thread is also returned.
 */
struct task_struct *task_early_kill(struct task_struct *tsk, int force_early)
{
	if (!tsk->mm)
		return NULL;
	/*
	 * Comparing ->mm here because current task might represent
	 * a subthread, while tsk always points to the main thread.
	 */
	if (force_early && tsk->mm == current->mm)
		return current;

	return find_early_kill_thread(tsk);
}

/*
 * Collect processes when the error hit an anonymous page.
 */
static void collect_procs_anon(struct folio *folio, struct page *page,
			       struct list_head *to_kill, int force_early)
{
	struct task_struct *tsk;
	struct anon_vma *av;
	pgoff_t pgoff;

	av = folio_lock_anon_vma_read(folio, NULL);
	if (av == NULL)	/* Not actually mapped anymore */
		return;

	pgoff = page_to_pgoff(page);
	rcu_read_lock();
	for_each_process(tsk) {
		struct vm_area_struct *vma;
		struct anon_vma_chain *vmac;
		struct task_struct *t = task_early_kill(tsk, force_early);
		unsigned long addr;

		if (!t)
			continue;
		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
					       pgoff, pgoff) {
			vma = vmac->vma;
			if (vma->vm_mm != t->mm)
				continue;
			addr = page_mapped_in_vma(page, vma);
			add_to_kill_anon_file(t, page, vma, to_kill, addr);
		}
	}
	rcu_read_unlock();
	anon_vma_unlock_read(av);
}

/*
 * Collect processes when the error hit a file mapped page.
 */
static void collect_procs_file(struct folio *folio, struct page *page,
			       struct list_head *to_kill, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct address_space *mapping = folio->mapping;
	pgoff_t pgoff;

	i_mmap_lock_read(mapping);
	rcu_read_lock();
	pgoff = page_to_pgoff(page);
	for_each_process(tsk) {
		struct task_struct *t = task_early_kill(tsk, force_early);
		unsigned long addr;

		if (!t)
			continue;
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
					  pgoff) {
			/*
			 * Send early kill signal to tasks where a vma covers
			 * the page but the corrupted page is not necessarily
			 * mapped in its pte.
			 * Assume applications who requested early kill want
			 * to be informed of all such data corruptions.
			 */
			if (vma->vm_mm != t->mm)
				continue;
			addr = page_address_in_vma(page, vma);
			add_to_kill_anon_file(t, page, vma, to_kill, addr);
		}
	}
	rcu_read_unlock();
	i_mmap_unlock_read(mapping);
}

#ifdef CONFIG_FS_DAX
static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
			      struct vm_area_struct *vma,
			      struct list_head *to_kill, pgoff_t pgoff)
{
	unsigned long addr = vma_address(vma, pgoff, 1);
	__add_to_kill(tsk, p, vma, to_kill, addr);
}

/*
 * Collect processes when the error hit a fsdax page.
 */
static void collect_procs_fsdax(struct page *page,
				struct address_space *mapping, pgoff_t pgoff,
				struct list_head *to_kill, bool pre_remove)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;

	i_mmap_lock_read(mapping);
	rcu_read_lock();
	for_each_process(tsk) {
		struct task_struct *t = tsk;

		/*
		 * Search for all tasks while MF_MEM_PRE_REMOVE is set, because
		 * the current may not be the one accessing the fsdax page.
		 * Otherwise, search for the current task.
		 */
		if (!pre_remove)
			t = task_early_kill(tsk, true);
		if (!t)
			continue;
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
			if (vma->vm_mm == t->mm)
				add_to_kill_fsdax(t, page, vma, to_kill, pgoff);
		}
	}
	rcu_read_unlock();
	i_mmap_unlock_read(mapping);
}
#endif /* CONFIG_FS_DAX */

/*
 * Collect the processes who have the corrupted page mapped to kill.
 */
static void collect_procs(struct folio *folio, struct page *page,
			  struct list_head *tokill, int force_early)
{
	if (!folio->mapping)
		return;
	if (unlikely(folio_test_ksm(folio)))
		collect_procs_ksm(folio, page, tokill, force_early);
	else if (folio_test_anon(folio))
		collect_procs_anon(folio, page, tokill, force_early);
	else
		collect_procs_file(folio, page, tokill, force_early);
}

struct hwpoison_walk {
	struct to_kill tk;
	unsigned long pfn;
	int flags;
};

static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift)
{
	tk->addr = addr;
	tk->size_shift = shift;
}
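
/*
 * Check whether a page-table entry maps the poisoned pfn: either a present
 * PTE pointing at @poisoned_pfn or a hwpoison swap entry encoding it.
 * If so, record the faulting address and mapping size in @tk and return 1.
 */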
static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
				  unsigned long poisoned_pfn, struct to_kill *tk)
{
	unsigned long pfn = 0;

	if (pte_present(pte)) {
		pfn = pte_pfn(pte);
	} else {
		swp_entry_t swp = pte_to_swp_entry(pte);

		if (is_hwpoison_entry(swp))
			pfn = swp_offset_pfn(swp);
	}

	if (!pfn || pfn != poisoned_pfn)
		return 0;

	set_to_kill(tk, addr, shift);
	return 1;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
				      struct hwpoison_walk *hwp)
{
	pmd_t pmd = *pmdp;
	unsigned long pfn;
	unsigned long hwpoison_vaddr;

	if (!pmd_present(pmd))
		return 0;
	pfn = pmd_pfn(pmd);
	if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) {
		hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT);
		set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT);
		return 1;
	}
	return 0;
}
#else
static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
				      struct hwpoison_walk *hwp)
{
	return 0;
}
#endif
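
/*
 * Page-walk callback for one PMD range: handle a huge PMD mapping directly,
 * otherwise scan the PTEs under it for an entry that maps hwp->pfn. Stops
 * the walk (returns 1) as soon as a match has been recorded in hwp->tk.
 */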
static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
			      unsigned long end, struct mm_walk *walk)
{
	struct hwpoison_walk *hwp = walk->private;
	int ret = 0;
	pte_t *ptep, *mapped_pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmdp, walk->vma);
	if (ptl) {
		ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp);
		spin_unlock(ptl);
		goto out;
	}

	mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp,
						addr, &ptl);
	if (!ptep)
		goto out;

	for (; addr != end; ptep++, addr += PAGE_SIZE) {
		ret = check_hwpoisoned_entry(ptep_get(ptep), addr, PAGE_SHIFT,
					     hwp->pfn, &hwp->tk);
		if (ret == 1)
			break;
	}
	pte_unmap_unlock(mapped_pte, ptl);
out:
	cond_resched();
	return ret;
}

#ifdef CONFIG_HUGETLB_PAGE
static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
				  unsigned long addr, unsigned long end,
				  struct mm_walk *walk)
{
	struct hwpoison_walk *hwp = walk->private;
	pte_t pte = huge_ptep_get(walk->mm, addr, ptep);
	struct hstate *h = hstate_vma(walk->vma);

	return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
				      hwp->pfn, &hwp->tk);
}
#else
#define hwpoison_hugetlb_range	NULL
#endif

static int hwpoison_test_walk(unsigned long start, unsigned long end,
			      struct mm_walk *walk)
{
	/* We also want to consider pages mapped into VM_PFNMAP. */
	return 0;
}

static const struct mm_walk_ops hwpoison_walk_ops = {
	.pmd_entry = hwpoison_pte_range,
	.hugetlb_entry = hwpoison_hugetlb_range,
	.test_walk = hwpoison_test_walk,
	.walk_lock = PGWALK_RDLOCK,
};

/*
 * Sends SIGBUS to the current process with error info.
 *
 * This function is intended to handle "Action Required" MCEs on already
 * hardware poisoned pages. They could happen, for example, when
 * memory_failure() failed to unmap the error page at the first call, or
 * when multiple local machine checks happened on different CPUs.
 *
 * MCE handler currently has no easy access to the error virtual address,
 * so this function walks page table to find it. The returned virtual address
 * is proper in most cases, but it could be wrong when the application
 * process has multiple entries mapping the error page.
 */
static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
				  int flags)
{
	int ret;
	struct hwpoison_walk priv = {
		.pfn = pfn,
	};
	priv.tk.tsk = p;

	if (!p->mm)
		return -EFAULT;

	mmap_read_lock(p->mm);
	ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwpoison_walk_ops,
			      (void *)&priv);
	/*
	 * ret = 1 when CMCI wins, regardless of whether try_to_unmap()
	 * succeeds or fails, then kill the process with SIGBUS.
	 * ret = 0 when poison page is a clean page and it's dropped, no
	 * SIGBUS is needed.
	 */
	if (ret == 1 && priv.tk.addr)
		kill_proc(&priv.tk, pfn, flags);
	mmap_read_unlock(p->mm);

	return ret > 0 ? -EHWPOISON : 0;
}

/*
 * MF_IGNORED - The m-f() handler marks the page as PG_hwpoison.
 * But it could not do more to isolate the page from being accessed again,
 * nor does it kill the process. This is extremely rare and one of the
 * potential causes is that the page state has been changed due to an
 * underlying race condition. This is the most severe outcome.
 *
 * MF_FAILED - The m-f() handler marks the page as PG_hwpoison.
 * It should have killed the process, but it can't isolate the page,
 * due to conditions such as an extra pin, unmap failure, etc. Accessing
 * the page again may trigger another MCE and the process will be killed
 * by the m-f() handler immediately.
 *
 * MF_DELAYED - The m-f() handler marks the page as PG_hwpoison.
 * The page is unmapped, and is removed from the LRU or file mapping.
 * An attempt to access the page again will trigger a page fault and the
 * PF handler will kill the process.
 *
 * MF_RECOVERED - The m-f() handler marks the page as PG_hwpoison.
 * The page has been completely isolated, that is, unmapped, taken out of
 * the buddy system, or hole-punched out of the file mapping.
 */
static const char *action_name[] = {
	[MF_IGNORED] = "Ignored",
	[MF_FAILED] = "Failed",
	[MF_DELAYED] = "Delayed",
	[MF_RECOVERED] = "Recovered",
};

static const char * const action_page_types[] = {
	[MF_MSG_KERNEL] = "reserved kernel page",
	[MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
	[MF_MSG_HUGE] = "huge page",
	[MF_MSG_FREE_HUGE] = "free huge page",
	[MF_MSG_GET_HWPOISON] = "get hwpoison page",
	[MF_MSG_UNMAP_FAILED] = "unmapping failed page",
	[MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
	[MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
	[MF_MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page",
	[MF_MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page",
	[MF_MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page",
	[MF_MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page",
	[MF_MSG_DIRTY_LRU] = "dirty LRU page",
	[MF_MSG_CLEAN_LRU] = "clean LRU page",
	[MF_MSG_TRUNCATED_LRU] = "already truncated LRU page",
	[MF_MSG_BUDDY] = "free buddy page",
	[MF_MSG_DAX] = "dax page",
	[MF_MSG_UNSPLIT_THP] = "unsplit thp",
	[MF_MSG_ALREADY_POISONED] = "already poisoned page",
	[MF_MSG_UNKNOWN] = "unknown page",
};

/*
 * XXX: It is possible that a page is isolated from LRU cache,
 * and then kept in swap cache or failed to remove from page cache.
 * The page count will stop it from being freed by unpoison.
 * Stress tests should be aware of this memory leak problem.
 */
static int delete_from_lru_cache(struct folio *folio)
{
	if (folio_isolate_lru(folio)) {
		/*
		 * Clear sensible page flags, so that the buddy system won't
		 * complain when the folio is unpoison-and-freed.
		 */
		folio_clear_active(folio);
		folio_clear_unevictable(folio);

		/*
		 * Poisoned page might never drop its ref count to 0 so we have
		 * to uncharge it manually from its memcg.
		 */
		mem_cgroup_uncharge(folio);

		/*
		 * drop the refcount elevated by folio_isolate_lru()
		 */
		folio_put(folio);
		return 0;
	}
	return -EIO;
}
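
/*
 * Try to punch the poisoned folio out of its file mapping: use the
 * filesystem's ->error_remove_folio() when available, otherwise fall back
 * to invalidating the folio. Returns MF_RECOVERED on success and MF_FAILED
 * when the folio could not be removed (e.g. dirty data or busy buffers).
 */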
static int truncate_error_folio(struct folio *folio, unsigned long pfn,
				struct address_space *mapping)
{
	int ret = MF_FAILED;

	if (mapping->a_ops->error_remove_folio) {
		int err = mapping->a_ops->error_remove_folio(mapping, folio);

		if (err != 0)
			pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
		else if (!filemap_release_folio(folio, GFP_NOIO))
			pr_info("%#lx: failed to release buffers\n", pfn);
		else
			ret = MF_RECOVERED;
	} else {
		/*
		 * If the file system doesn't support it, just invalidate.
		 * This fails on dirty folios or anything with private data.
		 */
		if (mapping_evict_folio(mapping, folio))
			ret = MF_RECOVERED;
		else
			pr_info("%#lx: Failed to invalidate\n", pfn);
	}

	return ret;
}

struct page_state {
	unsigned long mask;
	unsigned long res;
	enum mf_action_page_type type;

	/* Callback ->action() has to unlock the relevant page inside it. */
	int (*action)(struct page_state *ps, struct page *p);
};

/*
 * Return true if the page is still referenced by others, otherwise return
 * false.
 *
 * The extra_pins is true when one extra refcount is expected.
 */
static bool has_extra_refcount(struct page_state *ps, struct page *p,
			       bool extra_pins)
{
	int count = page_count(p) - 1;

	if (extra_pins)
		count -= folio_nr_pages(page_folio(p));

	if (count > 0) {
		pr_err("%#lx: %s still referenced by %d users\n",
		       page_to_pfn(p), action_page_types[ps->type], count);
		return true;
	}

	return false;
}

/*
 * Error hit kernel page.
 * Do nothing, try to be lucky and not touch this instead. For a few cases we
 * could be more sophisticated.
 */
static int me_kernel(struct page_state *ps, struct page *p)
{
	unlock_page(p);
	return MF_IGNORED;
}

/*
 * Page in unknown state. Do nothing.
 * This is a catch-all in case we fail to make sense of the page state.
 */
static int me_unknown(struct page_state *ps, struct page *p)
{
	pr_err("%#lx: Unknown page state\n", page_to_pfn(p));
	unlock_page(p);
	return MF_IGNORED;
}

/*
 * Clean (or cleaned) page cache page.
 */
static int me_pagecache_clean(struct page_state *ps, struct page *p)
{
	struct folio *folio = page_folio(p);
	int ret;
	struct address_space *mapping;
	bool extra_pins;

	delete_from_lru_cache(folio);

	/*
	 * For anonymous folios the only reference left
	 * should be the one m_f() holds.
	 */
	if (folio_test_anon(folio)) {
		ret = MF_RECOVERED;
		goto out;
	}

	/*
	 * Now truncate the page in the page cache. This is really
	 * more like a "temporary hole punch".
	 * Don't do this for block devices when someone else
	 * has a reference, because it could be file system metadata
	 * and that's not safe to truncate.
	 */
	mapping = folio_mapping(folio);
	if (!mapping) {
		/* Folio has been torn down in the meantime */
		ret = MF_FAILED;
		goto out;
	}

	/*
	 * The shmem page is kept in the page cache instead of being truncated,
	 * so it is expected to have an extra refcount after error-handling.
	 */
	extra_pins = shmem_mapping(mapping);

	/*
	 * Truncation is a bit tricky. Enable it per file system for now.
	 *
	 * Open: to take i_rwsem or not for this? Right now we don't.
	 */
	ret = truncate_error_folio(folio, page_to_pfn(p), mapping);
	if (has_extra_refcount(ps, p, extra_pins))
		ret = MF_FAILED;

out:
	folio_unlock(folio);

	return ret;
}

/*
 * Dirty pagecache page
 * Issues: when the error hit a hole page the error is not properly
 * propagated.
 */
static int me_pagecache_dirty(struct page_state *ps, struct page *p)
{
	struct folio *folio = page_folio(p);
	struct address_space *mapping = folio_mapping(folio);

	/* TBD: print more information about the file. */
	if (mapping) {
		/*
		 * IO error will be reported by write(), fsync(), etc.
		 * who check the mapping.
		 * This way the application knows that something went
		 * wrong with its dirty file data.
		 */
		mapping_set_error(mapping, -EIO);
	}

	return me_pagecache_clean(ps, p);
}

/*
 * Clean and dirty swap cache.
 *
 * Dirty swap cache page is tricky to handle. The page could live both in page
 * table and swap cache (i.e. the page is freshly swapped in). So it could be
 * referenced concurrently by 2 types of PTEs:
 * normal PTEs and swap PTEs. We try to handle them consistently by calling
 * try_to_unmap(!TTU_HWPOISON) to convert the normal PTEs to swap PTEs,
 * and then
 *	- clear dirty bit to prevent IO
 *	- remove from LRU
 *	- but keep in the swap cache, so that when we return to it on
 *	  a later page fault, we know the application is accessing
 *	  corrupted data and shall be killed (we installed simple
 *	  interception code in do_swap_page to catch it).
 *
 * Clean swap cache pages can be directly isolated. A later page fault will
 * bring in the known good data from disk.
 */
static int me_swapcache_dirty(struct page_state *ps, struct page *p)
{
	struct folio *folio = page_folio(p);
	int ret;
	bool extra_pins = false;

	folio_clear_dirty(folio);
	/* Trigger EIO in shmem: */
	folio_clear_uptodate(folio);

	ret = delete_from_lru_cache(folio) ? MF_FAILED : MF_DELAYED;
	folio_unlock(folio);

	if (ret == MF_DELAYED)
		extra_pins = true;

	if (has_extra_refcount(ps, p, extra_pins))
		ret = MF_FAILED;

	return ret;
}

static int me_swapcache_clean(struct page_state *ps, struct page *p)
{
	struct folio *folio = page_folio(p);
	int ret;

	delete_from_swap_cache(folio);

	ret = delete_from_lru_cache(folio) ? MF_FAILED : MF_RECOVERED;
	folio_unlock(folio);

	if (has_extra_refcount(ps, p, false))
		ret = MF_FAILED;

	return ret;
}

/*
 * Huge pages. Needs work.
 * Issues:
 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
 *   To narrow down kill region to one page, we need to break up pmd.
 */
static int me_huge_page(struct page_state *ps, struct page *p)
{
	struct folio *folio = page_folio(p);
	int res;
	struct address_space *mapping;
	bool extra_pins = false;

	mapping = folio_mapping(folio);
	if (mapping) {
		res = truncate_error_folio(folio, page_to_pfn(p), mapping);
		/* The page is kept in page cache. */
		extra_pins = true;
		folio_unlock(folio);
	} else {
		folio_unlock(folio);
		/*
		 * migration entry prevents later access on error hugepage,
		 * so we can free and dissolve it into buddy to save healthy
		 * subpages.
		 */
		folio_put(folio);
		if (__page_handle_poison(p) > 0) {
			page_ref_inc(p);
			res = MF_RECOVERED;
		} else {
			res = MF_FAILED;
		}
	}

	if (has_extra_refcount(ps, p, extra_pins))
		res = MF_FAILED;

	return res;
}

/*
 * Various page states we can handle.
 *
 * A page state is defined by its current page->flags bits.
 * The table matches them in order and calls the right handler.
 *
 * This is quite tricky because we can access the page at any time
 * in its life cycle, so all accesses have to be extremely careful.
 *
 * This is not complete. More states could be added.
 * For any missing state don't attempt recovery.
 */
#define dirty		(1UL << PG_dirty)
#define sc		((1UL << PG_swapcache) | (1UL << PG_swapbacked))
#define unevict		(1UL << PG_unevictable)
#define mlock		(1UL << PG_mlocked)
#define lru		(1UL << PG_lru)
#define head		(1UL << PG_head)
#define reserved	(1UL << PG_reserved)

static struct page_state error_states[] = {
	{ reserved, reserved, MF_MSG_KERNEL, me_kernel },
	/*
	 * free pages are specially detected outside this table:
	 * PG_buddy pages only make a small fraction of all free pages.
	 */
	{ head, head, MF_MSG_HUGE, me_huge_page },
	{ sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
	{ sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
	{ mlock|dirty, mlock|dirty, MF_MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty },
	{ mlock|dirty, mlock, MF_MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean },
	{ unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty },
	{ unevict|dirty, unevict, MF_MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean },
	{ lru|dirty, lru|dirty, MF_MSG_DIRTY_LRU, me_pagecache_dirty },
	{ lru|dirty, lru, MF_MSG_CLEAN_LRU, me_pagecache_clean },
	/*
	 * Catchall entry: must be at end.
	 */
	{ 0, 0, MF_MSG_UNKNOWN, me_unknown },
};

#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef lru
#undef head
#undef reserved
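
/*
 * Account one handled memory failure in the per-node statistics exposed
 * through the "memory_failure" sysfs attribute group above: bump the counter
 * matching @result (ignored/failed/delayed/recovered) and the total.
 */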
static void update_per_node_mf_stats(unsigned long pfn,
				     enum mf_result result)
{
	int nid = MAX_NUMNODES;
	struct memory_failure_stats *mf_stats = NULL;

	nid = pfn_to_nid(pfn);
	if (unlikely(nid < 0 || nid >= MAX_NUMNODES)) {
		WARN_ONCE(1, "Memory failure: pfn=%#lx, invalid nid=%d", pfn, nid);
		return;
	}

	mf_stats = &NODE_DATA(nid)->mf_stats;
	switch (result) {
	case MF_IGNORED:
		++mf_stats->ignored;
		break;
	case MF_FAILED:
		++mf_stats->failed;
		break;
	case MF_DELAYED:
		++mf_stats->delayed;
		break;
	case MF_RECOVERED:
		++mf_stats->recovered;
		break;
	default:
		WARN_ONCE(1, "Memory failure: mf_result=%d is not properly handled", result);
		break;
	}
	++mf_stats->total;
}

/*
 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
 */
static int action_result(unsigned long pfn, enum mf_action_page_type type,
			 enum mf_result result)
{
	trace_memory_failure_event(pfn, type, result);

	if (type != MF_MSG_ALREADY_POISONED) {
		num_poisoned_pages_inc(pfn);
		update_per_node_mf_stats(pfn, result);
	}

	pr_err("%#lx: recovery action for %s: %s\n",
	       pfn, action_page_types[type], action_name[result]);

	return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}

static int page_action(struct page_state *ps, struct page *p,
		       unsigned long pfn)
{
	int result;

	/* page p should be unlocked after returning from ps->action(). */
	result = ps->action(ps, p);

	/* Could do more checks here if page looks ok */
	/*
	 * Could adjust zone counters here to correct for the missing page.
	 */

	return action_result(pfn, ps->type, result);
}
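
/*
 * Pages that were both marked PG_hwpoison and taken off the buddy free list
 * are additionally tagged with MAGIC_HWPOISON in page->private, so that the
 * unpoison path can recognize this state and cancel both operations (see
 * __get_unpoison_page() below).
 */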
static inline bool PageHWPoisonTakenOff(struct page *page)
{
	return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON;
}

void SetPageHWPoisonTakenOff(struct page *page)
{
	set_page_private(page, MAGIC_HWPOISON);
}

void ClearPageHWPoisonTakenOff(struct page *page)
{
	if (PageHWPoison(page))
		set_page_private(page, 0);
}

/*
 * Return true if a page type of a given page is supported by hwpoison
 * mechanism (while handling could fail), otherwise false. This function
 * does not return true for hugetlb or device memory pages, so it's assumed
 * to be called only in the context where we never have such pages.
 */
static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
{
	if (PageSlab(page))
		return false;

	/* Soft offline could migrate non-LRU movable pages */
	if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page))
		return true;

	return PageLRU(page) || is_free_buddy_page(page);
}
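
/*
 * Try to grab a reference on the folio containing @page, but only when the
 * page is of a type the handler can deal with (see HWPoisonHandlable()).
 * Returns 1 when a reference was taken, 0 when not (e.g. a free page or a
 * folio that changed under us), and -EBUSY for page types we refuse to
 * touch. Hugetlb folios take a separate path via get_hwpoison_hugetlb_folio().
 */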
static int __get_hwpoison_page(struct page *page, unsigned long flags)
{
	struct folio *folio = page_folio(page);
	int ret = 0;
	bool hugetlb = false;

	ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, false);
	if (hugetlb) {
		/* Make sure hugetlb demotion did not happen from under us. */
		if (folio == page_folio(page))
			return ret;
		if (ret > 0) {
			folio_put(folio);
			folio = page_folio(page);
		}
	}

	/*
	 * This check prevents from calling folio_try_get() for any
	 * unsupported type of folio in order to reduce the risk of unexpected
	 * races caused by taking a folio refcount.
	 */
	if (!HWPoisonHandlable(&folio->page, flags))
		return -EBUSY;

	if (folio_try_get(folio)) {
		if (folio == page_folio(page))
			return 1;

		pr_info("%#lx cannot catch tail\n", page_to_pfn(page));
		folio_put(folio);
	}

	return 0;
}

#define GET_PAGE_MAX_RETRY_NUM 3
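
/*
 * Stabilize the page and take a reference for memory-failure handling,
 * retrying up to GET_PAGE_MAX_RETRY_NUM times (with shake_page() in between)
 * when we race with allocation, freeing or a temporarily unhandlable page
 * type. Returns 1 with a reference held, 0 for a free buddy or free hugetlb
 * page, and -EBUSY or -EIO once the retries are exhausted.
 */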
  1267. static int get_any_page(struct page *p, unsigned long flags)
  1268. {
  1269. int ret = 0, pass = 0;
  1270. bool count_increased = false;
  1271. if (flags & MF_COUNT_INCREASED)
  1272. count_increased = true;
  1273. try_again:
  1274. if (!count_increased) {
  1275. ret = __get_hwpoison_page(p, flags);
  1276. if (!ret) {
  1277. if (page_count(p)) {
  1278. /* We raced with an allocation, retry. */
  1279. if (pass++ < GET_PAGE_MAX_RETRY_NUM)
  1280. goto try_again;
  1281. ret = -EBUSY;
  1282. } else if (!PageHuge(p) && !is_free_buddy_page(p)) {
  1283. /* We raced with put_page, retry. */
  1284. if (pass++ < GET_PAGE_MAX_RETRY_NUM)
  1285. goto try_again;
  1286. ret = -EIO;
  1287. }
  1288. goto out;
  1289. } else if (ret == -EBUSY) {
  1290. /*
  1291. * We raced with (possibly temporary) unhandlable
  1292. * page, retry.
  1293. */
if (pass++ < GET_PAGE_MAX_RETRY_NUM) {
  1295. shake_page(p);
  1296. goto try_again;
  1297. }
  1298. ret = -EIO;
  1299. goto out;
  1300. }
  1301. }
  1302. if (PageHuge(p) || HWPoisonHandlable(p, flags)) {
  1303. ret = 1;
  1304. } else {
  1305. /*
  1306. * A page we cannot handle. Check whether we can turn
  1307. * it into something we can handle.
  1308. */
  1309. if (pass++ < GET_PAGE_MAX_RETRY_NUM) {
  1310. put_page(p);
  1311. shake_page(p);
  1312. count_increased = false;
  1313. goto try_again;
  1314. }
  1315. put_page(p);
  1316. ret = -EIO;
  1317. }
  1318. out:
  1319. if (ret == -EIO)
  1320. pr_err("%#lx: unhandlable page.\n", page_to_pfn(p));
  1321. return ret;
  1322. }
  1323. static int __get_unpoison_page(struct page *page)
  1324. {
  1325. struct folio *folio = page_folio(page);
  1326. int ret = 0;
  1327. bool hugetlb = false;
  1328. ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, true);
  1329. if (hugetlb) {
  1330. /* Make sure hugetlb demotion did not happen from under us. */
  1331. if (folio == page_folio(page))
  1332. return ret;
  1333. if (ret > 0)
  1334. folio_put(folio);
  1335. }
/*
 * PageHWPoisonTakenOff pages are not only marked PG_hwpoison but are also
 * removed from the buddy freelist, so unpoisoning must detect this state
 * and undo both operations.
 */
  1341. if (PageHWPoisonTakenOff(page))
  1342. return -EHWPOISON;
  1343. return get_page_unless_zero(page) ? 1 : 0;
  1344. }
/**
 * get_hwpoison_page() - Get refcount for memory error handling
 * @p: Raw error page (hit by memory error)
 * @flags: Flags controlling behavior of error handling
 *
 * get_hwpoison_page() takes a refcount on the error page so that a memory
 * error on it can be handled, after checking that the page is in a
 * well-defined state (i.e. a page type we can handle the error on, such
 * as an LRU page or a hugetlb page).
 *
 * Memory error handling can trigger at any time on any type of page, so
 * it is prone to racing with the normal memory management lifecycle (such
 * as allocation and free). To avoid such races, get_hwpoison_page() takes
 * extra care with the error page's state (as done in __get_hwpoison_page())
 * and retries in get_any_page().
 *
 * When called from unpoison_memory(), the caller should already have
 * ensured that the given page has PG_hwpoison set, so it is never reused
 * for other allocations and __get_unpoison_page() never races with them.
 *
 * Return: 0 on failure or for a free buddy (hugetlb) page,
 *         1 on success for an in-use page in a well-defined state,
 *         -EIO for pages on which we can not handle memory errors,
 *         -EBUSY when get_hwpoison_page() has raced with page lifecycle
 *         operations like allocation and free,
 *         -EHWPOISON when the page is hwpoisoned and taken off the buddy list.
 */
  1372. static int get_hwpoison_page(struct page *p, unsigned long flags)
  1373. {
  1374. int ret;
  1375. zone_pcp_disable(page_zone(p));
  1376. if (flags & MF_UNPOISON)
  1377. ret = __get_unpoison_page(p);
  1378. else
  1379. ret = get_any_page(p, flags);
  1380. zone_pcp_enable(page_zone(p));
  1381. return ret;
  1382. }
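/*
 * Illustrative sketch (hypothetical helper, not part of the kernel API):
 * how a caller might act on the return values documented above. The real
 * consumer is memory_failure() below; the policy here is an assumption
 * for illustration only.
 */
static int __maybe_unused example_grab_error_page(struct page *p,
						  unsigned long mf_flags,
						  bool *page_was_free)
{
	int ret = get_hwpoison_page(p, mf_flags);

	*page_was_free = (ret == 0);	/* free buddy or free hugetlb page */
	if (ret < 0)			/* -EIO, -EBUSY or -EHWPOISON: give up */
		return ret;
	return 0;			/* ret == 1: a refcount is now held */
}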
  1383. /*
  1384. * The caller must guarantee the folio isn't large folio, except hugetlb.
  1385. * try_to_unmap() can't handle it.
  1386. */
  1387. int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
  1388. {
  1389. enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
  1390. struct address_space *mapping;
  1391. if (folio_test_swapcache(folio)) {
  1392. pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
  1393. ttu &= ~TTU_HWPOISON;
  1394. }
  1395. /*
  1396. * Propagate the dirty bit from PTEs to struct page first, because we
  1397. * need this to decide if we should kill or just drop the page.
  1398. * XXX: the dirty test could be racy: set_page_dirty() may not always
  1399. * be called inside page lock (it's recommended but not enforced).
  1400. */
  1401. mapping = folio_mapping(folio);
  1402. if (!must_kill && !folio_test_dirty(folio) && mapping &&
  1403. mapping_can_writeback(mapping)) {
  1404. if (folio_mkclean(folio)) {
  1405. folio_set_dirty(folio);
  1406. } else {
  1407. ttu &= ~TTU_HWPOISON;
  1408. pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
  1409. pfn);
  1410. }
  1411. }
  1412. if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
  1413. /*
  1414. * For hugetlb folios in shared mappings, try_to_unmap
  1415. * could potentially call huge_pmd_unshare. Because of
  1416. * this, take semaphore in write mode here and set
  1417. * TTU_RMAP_LOCKED to indicate we have taken the lock
  1418. * at this higher level.
  1419. */
  1420. mapping = hugetlb_folio_mapping_lock_write(folio);
  1421. if (!mapping) {
  1422. pr_info("%#lx: could not lock mapping for mapped hugetlb folio\n",
  1423. folio_pfn(folio));
  1424. return -EBUSY;
  1425. }
  1426. try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
  1427. i_mmap_unlock_write(mapping);
  1428. } else {
  1429. try_to_unmap(folio, ttu);
  1430. }
  1431. return folio_mapped(folio) ? -EBUSY : 0;
  1432. }
  1433. /*
  1434. * Do all that is necessary to remove user space mappings. Unmap
  1435. * the pages and send SIGBUS to the processes if the data was dirty.
  1436. */
  1437. static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
  1438. unsigned long pfn, int flags)
  1439. {
  1440. LIST_HEAD(tokill);
  1441. bool unmap_success;
  1442. int forcekill;
  1443. bool mlocked = folio_test_mlocked(folio);
  1444. /*
  1445. * Here we are interested only in user-mapped pages, so skip any
  1446. * other types of pages.
  1447. */
  1448. if (folio_test_reserved(folio) || folio_test_slab(folio) ||
  1449. folio_test_pgtable(folio) || folio_test_offline(folio))
  1450. return true;
  1451. if (!(folio_test_lru(folio) || folio_test_hugetlb(folio)))
  1452. return true;
  1453. /*
  1454. * This check implies we don't kill processes if their pages
  1455. * are in the swap cache early. Those are always late kills.
  1456. */
  1457. if (!folio_mapped(folio))
  1458. return true;
  1459. /*
  1460. * First collect all the processes that have the page
  1461. * mapped in dirty form. This has to be done before try_to_unmap,
  1462. * because ttu takes the rmap data structures down.
  1463. */
  1464. collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
  1465. unmap_success = !unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
  1466. if (!unmap_success)
  1467. pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n",
  1468. pfn, folio_mapcount(folio));
  1469. /*
  1470. * try_to_unmap() might put mlocked page in lru cache, so call
  1471. * shake_page() again to ensure that it's flushed.
  1472. */
  1473. if (mlocked)
  1474. shake_folio(folio);
/*
 * Now that the dirty bit has been propagated to the struct page and all
 * unmaps are done, we can decide whether killing is needed. Only kill when
 * the page was dirty or the process is not restartable; otherwise the
 * tokill list is merely freed. When unmapping failed earlier, use a more
 * forceful, uncatchable kill to prevent any further access to the poisoned
 * memory.
 */
  1485. forcekill = folio_test_dirty(folio) || (flags & MF_MUST_KILL) ||
  1486. !unmap_success;
  1487. kill_procs(&tokill, forcekill, pfn, flags);
  1488. return unmap_success;
  1489. }
  1490. static int identify_page_state(unsigned long pfn, struct page *p,
  1491. unsigned long page_flags)
  1492. {
  1493. struct page_state *ps;
  1494. /*
  1495. * The first check uses the current page flags which may not have any
  1496. * relevant information. The second check with the saved page flags is
  1497. * carried out only if the first check can't determine the page status.
  1498. */
  1499. for (ps = error_states;; ps++)
  1500. if ((p->flags & ps->mask) == ps->res)
  1501. break;
  1502. page_flags |= (p->flags & (1UL << PG_dirty));
  1503. if (!ps->mask)
  1504. for (ps = error_states;; ps++)
  1505. if ((page_flags & ps->mask) == ps->res)
  1506. break;
  1507. return page_action(ps, p, pfn);
  1508. }
/*
 * When 'release' is false, a failed THP split is not the end of the story:
 * the caller still has more to do, so the page refcount taken earlier must
 * be kept.
 */
  1514. static int try_to_split_thp_page(struct page *page, bool release)
  1515. {
  1516. int ret;
  1517. lock_page(page);
  1518. ret = split_huge_page(page);
  1519. unlock_page(page);
  1520. if (ret && release)
  1521. put_page(page);
  1522. return ret;
  1523. }
  1524. static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
  1525. struct address_space *mapping, pgoff_t index, int flags)
  1526. {
  1527. struct to_kill *tk;
  1528. unsigned long size = 0;
  1529. list_for_each_entry(tk, to_kill, nd)
  1530. if (tk->size_shift)
  1531. size = max(size, 1UL << tk->size_shift);
  1532. if (size) {
  1533. /*
  1534. * Unmap the largest mapping to avoid breaking up device-dax
  1535. * mappings which are constant size. The actual size of the
  1536. * mapping being torn down is communicated in siginfo, see
  1537. * kill_proc()
  1538. */
  1539. loff_t start = ((loff_t)index << PAGE_SHIFT) & ~(size - 1);
  1540. unmap_mapping_range(mapping, start, size, 0);
  1541. }
  1542. kill_procs(to_kill, flags & MF_MUST_KILL, pfn, flags);
  1543. }
/*
 * Only dev_pagemap pages get here: fsdax pages whose filesystem either does
 * not claim or fails to claim a hwpoison event, and devdax pages. fsdax
 * pages are initialized per base page, while devdax pages may be
 * initialized either as base pages or as compound pages with vmemmap
 * optimization enabled. Devdax keeps its hwpoison handling simple: if a
 * subpage of a compound page is poisoned, marking the compound head page
 * is sufficient.
 */
  1553. static int mf_generic_kill_procs(unsigned long long pfn, int flags,
  1554. struct dev_pagemap *pgmap)
  1555. {
  1556. struct folio *folio = pfn_folio(pfn);
  1557. LIST_HEAD(to_kill);
  1558. dax_entry_t cookie;
  1559. int rc = 0;
  1560. /*
  1561. * Prevent the inode from being freed while we are interrogating
  1562. * the address_space, typically this would be handled by
  1563. * lock_page(), but dax pages do not use the page lock. This
  1564. * also prevents changes to the mapping of this pfn until
  1565. * poison signaling is complete.
  1566. */
  1567. cookie = dax_lock_folio(folio);
  1568. if (!cookie)
  1569. return -EBUSY;
  1570. if (hwpoison_filter(&folio->page)) {
  1571. rc = -EOPNOTSUPP;
  1572. goto unlock;
  1573. }
  1574. switch (pgmap->type) {
  1575. case MEMORY_DEVICE_PRIVATE:
  1576. case MEMORY_DEVICE_COHERENT:
  1577. /*
  1578. * TODO: Handle device pages which may need coordination
  1579. * with device-side memory.
  1580. */
  1581. rc = -ENXIO;
  1582. goto unlock;
  1583. default:
  1584. break;
  1585. }
  1586. /*
  1587. * Use this flag as an indication that the dax page has been
  1588. * remapped UC to prevent speculative consumption of poison.
  1589. */
  1590. SetPageHWPoison(&folio->page);
  1591. /*
  1592. * Unlike System-RAM there is no possibility to swap in a
  1593. * different physical page at a given virtual address, so all
  1594. * userspace consumption of ZONE_DEVICE memory necessitates
  1595. * SIGBUS (i.e. MF_MUST_KILL)
  1596. */
  1597. flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
  1598. collect_procs(folio, &folio->page, &to_kill, true);
  1599. unmap_and_kill(&to_kill, pfn, folio->mapping, folio->index, flags);
  1600. unlock:
  1601. dax_unlock_folio(folio, cookie);
  1602. return rc;
  1603. }
  1604. #ifdef CONFIG_FS_DAX
  1605. /**
  1606. * mf_dax_kill_procs - Collect and kill processes who are using this file range
  1607. * @mapping: address_space of the file in use
  1608. * @index: start pgoff of the range within the file
  1609. * @count: length of the range, in unit of PAGE_SIZE
  1610. * @mf_flags: memory failure flags
  1611. */
  1612. int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
  1613. unsigned long count, int mf_flags)
  1614. {
  1615. LIST_HEAD(to_kill);
  1616. dax_entry_t cookie;
  1617. struct page *page;
  1618. size_t end = index + count;
  1619. bool pre_remove = mf_flags & MF_MEM_PRE_REMOVE;
  1620. mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
  1621. for (; index < end; index++) {
  1622. page = NULL;
  1623. cookie = dax_lock_mapping_entry(mapping, index, &page);
  1624. if (!cookie)
  1625. return -EBUSY;
  1626. if (!page)
  1627. goto unlock;
  1628. if (!pre_remove)
  1629. SetPageHWPoison(page);
  1630. /*
  1631. * The pre_remove case is revoking access, the memory is still
  1632. * good and could theoretically be put back into service.
  1633. */
  1634. collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove);
  1635. unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
  1636. index, mf_flags);
  1637. unlock:
  1638. dax_unlock_mapping_entry(mapping, index, cookie);
  1639. }
  1640. return 0;
  1641. }
  1642. EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
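/*
 * Minimal sketch of how a filesystem that claims fsdax poison events might
 * forward an affected byte range to mf_dax_kill_procs(). The helper name
 * and the way the range is derived are illustrative assumptions; see the
 * dax holder ->notify_failure implementations for the real call sites.
 */
static int __maybe_unused example_fs_notify_poison(struct address_space *mapping,
						   loff_t offset, size_t len,
						   int mf_flags)
{
	pgoff_t index = offset >> PAGE_SHIFT;
	unsigned long count = DIV_ROUND_UP(offset + len, PAGE_SIZE) - index;

	return mf_dax_kill_procs(mapping, index, count, mf_flags);
}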
  1643. #endif /* CONFIG_FS_DAX */
  1644. #ifdef CONFIG_HUGETLB_PAGE
/*
 * struct raw_hwp_page records information about a "raw error page";
 * entries form a singly linked list anchored at the folio's
 * ->_hugetlb_hwpoison field.
 */
  1649. struct raw_hwp_page {
  1650. struct llist_node node;
  1651. struct page *page;
  1652. };
  1653. static inline struct llist_head *raw_hwp_list_head(struct folio *folio)
  1654. {
  1655. return (struct llist_head *)&folio->_hugetlb_hwpoison;
  1656. }
  1657. bool is_raw_hwpoison_page_in_hugepage(struct page *page)
  1658. {
  1659. struct llist_head *raw_hwp_head;
  1660. struct raw_hwp_page *p;
  1661. struct folio *folio = page_folio(page);
  1662. bool ret = false;
  1663. if (!folio_test_hwpoison(folio))
  1664. return false;
  1665. if (!folio_test_hugetlb(folio))
  1666. return PageHWPoison(page);
/*
 * When RawHwpUnreliable is set, the kernel has lost track of which
 * subpages are hwpoisoned, so report as if ALL subpages were hwpoisoned.
 */
  1671. if (folio_test_hugetlb_raw_hwp_unreliable(folio))
  1672. return true;
  1673. mutex_lock(&mf_mutex);
  1674. raw_hwp_head = raw_hwp_list_head(folio);
  1675. llist_for_each_entry(p, raw_hwp_head->first, node) {
  1676. if (page == p->page) {
  1677. ret = true;
  1678. break;
  1679. }
  1680. }
  1681. mutex_unlock(&mf_mutex);
  1682. return ret;
  1683. }
  1684. static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
  1685. {
  1686. struct llist_node *head;
  1687. struct raw_hwp_page *p, *next;
  1688. unsigned long count = 0;
  1689. head = llist_del_all(raw_hwp_list_head(folio));
  1690. llist_for_each_entry_safe(p, next, head, node) {
  1691. if (move_flag)
  1692. SetPageHWPoison(p->page);
  1693. else
  1694. num_poisoned_pages_sub(page_to_pfn(p->page), 1);
  1695. kfree(p);
  1696. count++;
  1697. }
  1698. return count;
  1699. }
  1700. static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
  1701. {
  1702. struct llist_head *head;
  1703. struct raw_hwp_page *raw_hwp;
  1704. struct raw_hwp_page *p;
  1705. int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0;
/*
 * Once the hwpoisoned hugepage has lost its reliable raw error info,
 * there is little point in recording further errors precisely, so don't
 * add more raw error entries.
 */
  1711. if (folio_test_hugetlb_raw_hwp_unreliable(folio))
  1712. return -EHWPOISON;
  1713. head = raw_hwp_list_head(folio);
  1714. llist_for_each_entry(p, head->first, node) {
  1715. if (p->page == page)
  1716. return -EHWPOISON;
  1717. }
  1718. raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
  1719. if (raw_hwp) {
  1720. raw_hwp->page = page;
  1721. llist_add(&raw_hwp->node, head);
  1722. /* the first error event will be counted in action_result(). */
  1723. if (ret)
  1724. num_poisoned_pages_inc(page_to_pfn(page));
  1725. } else {
/*
 * Failed to save the raw error info. We can no longer track all
 * hwpoisoned subpages, so we must refuse to free/dissolve this
 * hwpoisoned hugepage.
 */
  1731. folio_set_hugetlb_raw_hwp_unreliable(folio);
  1732. /*
  1733. * Once hugetlb_raw_hwp_unreliable is set, raw_hwp_page is not
  1734. * used any more, so free it.
  1735. */
  1736. __folio_free_raw_hwp(folio, false);
  1737. }
  1738. return ret;
  1739. }
  1740. static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag)
  1741. {
  1742. /*
  1743. * hugetlb_vmemmap_optimized hugepages can't be freed because struct
  1744. * pages for tail pages are required but they don't exist.
  1745. */
  1746. if (move_flag && folio_test_hugetlb_vmemmap_optimized(folio))
  1747. return 0;
  1748. /*
  1749. * hugetlb_raw_hwp_unreliable hugepages shouldn't be unpoisoned by
  1750. * definition.
  1751. */
  1752. if (folio_test_hugetlb_raw_hwp_unreliable(folio))
  1753. return 0;
  1754. return __folio_free_raw_hwp(folio, move_flag);
  1755. }
  1756. void folio_clear_hugetlb_hwpoison(struct folio *folio)
  1757. {
  1758. if (folio_test_hugetlb_raw_hwp_unreliable(folio))
  1759. return;
  1760. if (folio_test_hugetlb_vmemmap_optimized(folio))
  1761. return;
  1762. folio_clear_hwpoison(folio);
  1763. folio_free_raw_hwp(folio, true);
  1764. }
  1765. /*
  1766. * Called from hugetlb code with hugetlb_lock held.
  1767. *
  1768. * Return values:
  1769. * 0 - free hugepage
  1770. * 1 - in-use hugepage
  1771. * 2 - not a hugepage
  1772. * -EBUSY - the hugepage is busy (try to retry)
  1773. * -EHWPOISON - the hugepage is already hwpoisoned
  1774. */
  1775. int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
  1776. bool *migratable_cleared)
  1777. {
  1778. struct page *page = pfn_to_page(pfn);
  1779. struct folio *folio = page_folio(page);
  1780. int ret = 2; /* fallback to normal page handling */
  1781. bool count_increased = false;
  1782. if (!folio_test_hugetlb(folio))
  1783. goto out;
  1784. if (flags & MF_COUNT_INCREASED) {
  1785. ret = 1;
  1786. count_increased = true;
  1787. } else if (folio_test_hugetlb_freed(folio)) {
  1788. ret = 0;
  1789. } else if (folio_test_hugetlb_migratable(folio)) {
  1790. ret = folio_try_get(folio);
  1791. if (ret)
  1792. count_increased = true;
  1793. } else {
  1794. ret = -EBUSY;
  1795. if (!(flags & MF_NO_RETRY))
  1796. goto out;
  1797. }
  1798. if (folio_set_hugetlb_hwpoison(folio, page)) {
  1799. ret = -EHWPOISON;
  1800. goto out;
  1801. }
/*
 * Clear hugetlb_migratable for hwpoisoned hugepages to prevent them
 * from being migrated by memory hotremove.
 */
  1806. if (count_increased && folio_test_hugetlb_migratable(folio)) {
  1807. folio_clear_hugetlb_migratable(folio);
  1808. *migratable_cleared = true;
  1809. }
  1810. return ret;
  1811. out:
  1812. if (count_increased)
  1813. folio_put(folio);
  1814. return ret;
  1815. }
/*
 * Taking a refcount on a hugetlb page needs extra care about races with
 * basic operations such as hugepage allocation, free and demotion, so some
 * of the hwpoison prechecks (pinning, and testing/setting PageHWPoison)
 * are done within a single hugetlb_lock critical section.
 */
  1822. static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
  1823. {
  1824. int res;
  1825. struct page *p = pfn_to_page(pfn);
  1826. struct folio *folio;
  1827. unsigned long page_flags;
  1828. bool migratable_cleared = false;
  1829. *hugetlb = 1;
  1830. retry:
  1831. res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
  1832. if (res == 2) { /* fallback to normal page handling */
  1833. *hugetlb = 0;
  1834. return 0;
  1835. } else if (res == -EHWPOISON) {
  1836. if (flags & MF_ACTION_REQUIRED) {
  1837. folio = page_folio(p);
  1838. res = kill_accessing_process(current, folio_pfn(folio), flags);
  1839. }
  1840. action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
  1841. return res;
  1842. } else if (res == -EBUSY) {
  1843. if (!(flags & MF_NO_RETRY)) {
  1844. flags |= MF_NO_RETRY;
  1845. goto retry;
  1846. }
  1847. return action_result(pfn, MF_MSG_GET_HWPOISON, MF_IGNORED);
  1848. }
  1849. folio = page_folio(p);
  1850. folio_lock(folio);
  1851. if (hwpoison_filter(p)) {
  1852. folio_clear_hugetlb_hwpoison(folio);
  1853. if (migratable_cleared)
  1854. folio_set_hugetlb_migratable(folio);
  1855. folio_unlock(folio);
  1856. if (res == 1)
  1857. folio_put(folio);
  1858. return -EOPNOTSUPP;
  1859. }
/*
 * Handle a free hugepage. The PageHWPoison flag prevents races with
 * hugepage allocation or demotion.
 */
  1864. if (res == 0) {
  1865. folio_unlock(folio);
  1866. if (__page_handle_poison(p) > 0) {
  1867. page_ref_inc(p);
  1868. res = MF_RECOVERED;
  1869. } else {
  1870. res = MF_FAILED;
  1871. }
  1872. return action_result(pfn, MF_MSG_FREE_HUGE, res);
  1873. }
  1874. page_flags = folio->flags;
  1875. if (!hwpoison_user_mappings(folio, p, pfn, flags)) {
  1876. folio_unlock(folio);
  1877. return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_FAILED);
  1878. }
  1879. return identify_page_state(pfn, p, page_flags);
  1880. }
  1881. #else
  1882. static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
  1883. {
  1884. return 0;
  1885. }
  1886. static inline unsigned long folio_free_raw_hwp(struct folio *folio, bool flag)
  1887. {
  1888. return 0;
  1889. }
  1890. #endif /* CONFIG_HUGETLB_PAGE */
  1891. /* Drop the extra refcount in case we come from madvise() */
  1892. static void put_ref_page(unsigned long pfn, int flags)
  1893. {
  1894. if (!(flags & MF_COUNT_INCREASED))
  1895. return;
  1896. put_page(pfn_to_page(pfn));
  1897. }
  1898. static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
  1899. struct dev_pagemap *pgmap)
  1900. {
  1901. int rc = -ENXIO;
  1902. /* device metadata space is not recoverable */
  1903. if (!pgmap_pfn_valid(pgmap, pfn))
  1904. goto out;
  1905. /*
  1906. * Call driver's implementation to handle the memory failure, otherwise
  1907. * fall back to generic handler.
  1908. */
  1909. if (pgmap_has_memory_failure(pgmap)) {
  1910. rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
  1911. /*
  1912. * Fall back to generic handler too if operation is not
  1913. * supported inside the driver/device/filesystem.
  1914. */
  1915. if (rc != -EOPNOTSUPP)
  1916. goto out;
  1917. }
  1918. rc = mf_generic_kill_procs(pfn, flags, pgmap);
  1919. out:
  1920. /* drop pgmap ref acquired in caller */
  1921. put_dev_pagemap(pgmap);
  1922. if (rc != -EOPNOTSUPP)
  1923. action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
  1924. return rc;
  1925. }
/*
 * Called when a THP split has failed, e.g. because the page may be
 * RDMA-pinned, so not much can be done for recovery. A SIGBUS should still
 * be delivered with the vaddr provided so that the user application has a
 * chance to recover, and each process's election for early kill on MCE is
 * honored.
 */
  1933. static void kill_procs_now(struct page *p, unsigned long pfn, int flags,
  1934. struct folio *folio)
  1935. {
  1936. LIST_HEAD(tokill);
  1937. collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
  1938. kill_procs(&tokill, true, pfn, flags);
  1939. }
  1940. /**
  1941. * memory_failure - Handle memory failure of a page.
  1942. * @pfn: Page Number of the corrupted page
  1943. * @flags: fine tune action taken
  1944. *
  1945. * This function is called by the low level machine check code
  1946. * of an architecture when it detects hardware memory corruption
  1947. * of a page. It tries its best to recover, which includes
  1948. * dropping pages, killing processes etc.
  1949. *
  1950. * The function is primarily of use for corruptions that
  1951. * happen outside the current execution context (e.g. when
  1952. * detected by a background scrubber)
  1953. *
  1954. * Must run in process context (e.g. a work queue) with interrupts
  1955. * enabled and no spinlocks held.
  1956. *
 * Return: 0 if the memory error was handled successfully,
 *         -EOPNOTSUPP if hwpoison_filter() filtered the error event,
 *         another negative errno (other than -EOPNOTSUPP) on failure.
  1960. */
  1961. int memory_failure(unsigned long pfn, int flags)
  1962. {
  1963. struct page *p;
  1964. struct folio *folio;
  1965. struct dev_pagemap *pgmap;
  1966. int res = 0;
  1967. unsigned long page_flags;
  1968. bool retry = true;
  1969. int hugetlb = 0;
  1970. if (!sysctl_memory_failure_recovery)
  1971. panic("Memory failure on page %lx", pfn);
  1972. mutex_lock(&mf_mutex);
  1973. if (!(flags & MF_SW_SIMULATED))
  1974. hw_memory_failure = true;
  1975. p = pfn_to_online_page(pfn);
  1976. if (!p) {
  1977. res = arch_memory_failure(pfn, flags);
  1978. if (res == 0)
  1979. goto unlock_mutex;
  1980. if (pfn_valid(pfn)) {
  1981. pgmap = get_dev_pagemap(pfn, NULL);
  1982. put_ref_page(pfn, flags);
  1983. if (pgmap) {
  1984. res = memory_failure_dev_pagemap(pfn, flags,
  1985. pgmap);
  1986. goto unlock_mutex;
  1987. }
  1988. }
  1989. pr_err("%#lx: memory outside kernel control\n", pfn);
  1990. res = -ENXIO;
  1991. goto unlock_mutex;
  1992. }
  1993. try_again:
  1994. res = try_memory_failure_hugetlb(pfn, flags, &hugetlb);
  1995. if (hugetlb)
  1996. goto unlock_mutex;
  1997. if (TestSetPageHWPoison(p)) {
  1998. res = -EHWPOISON;
  1999. if (flags & MF_ACTION_REQUIRED)
  2000. res = kill_accessing_process(current, pfn, flags);
  2001. if (flags & MF_COUNT_INCREASED)
  2002. put_page(p);
  2003. action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
  2004. goto unlock_mutex;
  2005. }
/*
 * There is nothing we need to or can do about count=0 pages:
 * 1) it's a free page, and therefore in safe hands:
 *    check_new_page() will be the gatekeeper.
 * 2) it's part of a non-compound high-order page, which implies some
 *    kernel user we cannot stop from reading/writing the page; let's
 *    hope it has already been used and will be freed some time later.
 * In fact it is dangerous to bump the page count up from 0 directly,
 * because that may cause a page_ref_freeze()/page_ref_unfreeze()
 * mismatch.
 */
  2017. if (!(flags & MF_COUNT_INCREASED)) {
  2018. res = get_hwpoison_page(p, flags);
  2019. if (!res) {
  2020. if (is_free_buddy_page(p)) {
  2021. if (take_page_off_buddy(p)) {
  2022. page_ref_inc(p);
  2023. res = MF_RECOVERED;
  2024. } else {
  2025. /* We lost the race, try again */
  2026. if (retry) {
  2027. ClearPageHWPoison(p);
  2028. retry = false;
  2029. goto try_again;
  2030. }
  2031. res = MF_FAILED;
  2032. }
  2033. res = action_result(pfn, MF_MSG_BUDDY, res);
  2034. } else {
  2035. res = action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
  2036. }
  2037. goto unlock_mutex;
  2038. } else if (res < 0) {
  2039. res = action_result(pfn, MF_MSG_GET_HWPOISON, MF_IGNORED);
  2040. goto unlock_mutex;
  2041. }
  2042. }
  2043. folio = page_folio(p);
  2044. /* filter pages that are protected from hwpoison test by users */
  2045. folio_lock(folio);
  2046. if (hwpoison_filter(p)) {
  2047. ClearPageHWPoison(p);
  2048. folio_unlock(folio);
  2049. folio_put(folio);
  2050. res = -EOPNOTSUPP;
  2051. goto unlock_mutex;
  2052. }
  2053. folio_unlock(folio);
  2054. if (folio_test_large(folio)) {
/*
 * The flag must be set after the refcount is bumped, otherwise it may
 * race with THP split. It can't be set in get_hwpoison_page() because
 * that is also called by soft offline and only for !MF_COUNT_INCREASED,
 * so this is the best place to set it.
 *
 * The error handling paths of get_hwpoison_page() above don't need this
 * care, since they deal with either a free page or an unhandlable page;
 * the refcount is bumped iff the page is a valid, handlable page.
 */
  2068. folio_set_has_hwpoisoned(folio);
  2069. if (try_to_split_thp_page(p, false) < 0) {
  2070. res = -EHWPOISON;
  2071. kill_procs_now(p, pfn, flags, folio);
  2072. put_page(p);
  2073. action_result(pfn, MF_MSG_UNSPLIT_THP, MF_FAILED);
  2074. goto unlock_mutex;
  2075. }
  2076. VM_BUG_ON_PAGE(!page_count(p), p);
  2077. folio = page_folio(p);
  2078. }
  2079. /*
  2080. * We ignore non-LRU pages for good reasons.
  2081. * - PG_locked is only well defined for LRU pages and a few others
  2082. * - to avoid races with __SetPageLocked()
  2083. * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
  2084. * The check (unnecessarily) ignores LRU pages being isolated and
  2085. * walked by the page reclaim code, however that's not a big loss.
  2086. */
  2087. shake_folio(folio);
  2088. folio_lock(folio);
/*
 * Only non-compound pages are expected here. The page cannot become
 * compound again because the folio has been split and an extra refcount
 * is held.
 */
  2094. WARN_ON(folio_test_large(folio));
  2095. /*
  2096. * We use page flags to determine what action should be taken, but
  2097. * the flags can be modified by the error containment action. One
  2098. * example is an mlocked page, where PG_mlocked is cleared by
  2099. * folio_remove_rmap_*() in try_to_unmap_one(). So to determine page
  2100. * status correctly, we save a copy of the page flags at this time.
  2101. */
  2102. page_flags = folio->flags;
  2103. /*
  2104. * __munlock_folio() may clear a writeback folio's LRU flag without
  2105. * the folio lock. We need to wait for writeback completion for this
  2106. * folio or it may trigger a vfs BUG while evicting inode.
  2107. */
  2108. if (!folio_test_lru(folio) && !folio_test_writeback(folio))
  2109. goto identify_page_state;
  2110. /*
  2111. * It's very difficult to mess with pages currently under IO
  2112. * and in many cases impossible, so we just avoid it here.
  2113. */
  2114. folio_wait_writeback(folio);
  2115. /*
  2116. * Now take care of user space mappings.
  2117. * Abort on fail: __filemap_remove_folio() assumes unmapped page.
  2118. */
  2119. if (!hwpoison_user_mappings(folio, p, pfn, flags)) {
  2120. res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_FAILED);
  2121. goto unlock_page;
  2122. }
  2123. /*
  2124. * Torn down by someone else?
  2125. */
  2126. if (folio_test_lru(folio) && !folio_test_swapcache(folio) &&
  2127. folio->mapping == NULL) {
  2128. res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
  2129. goto unlock_page;
  2130. }
  2131. identify_page_state:
  2132. res = identify_page_state(pfn, p, page_flags);
  2133. mutex_unlock(&mf_mutex);
  2134. return res;
  2135. unlock_page:
  2136. folio_unlock(folio);
  2137. unlock_mutex:
  2138. mutex_unlock(&mf_mutex);
  2139. return res;
  2140. }
  2141. EXPORT_SYMBOL_GPL(memory_failure);
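/*
 * Minimal sketch of a synchronous caller, e.g. an architecture's machine
 * check handler running in task context after poison was consumed by a
 * user access. The function name and surrounding context are illustrative
 * assumptions; see the arch-specific MCE code for real call sites.
 */
static void __maybe_unused example_handle_consumed_poison(unsigned long pfn)
{
	int ret = memory_failure(pfn, MF_ACTION_REQUIRED);

	if (ret == -EHWPOISON)
		pr_err("%#lx: page was already poisoned\n", pfn);
	else if (ret)
		pr_err("%#lx: recovery failed: %d\n", pfn, ret);
}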
  2142. #define MEMORY_FAILURE_FIFO_ORDER 4
  2143. #define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
  2144. struct memory_failure_entry {
  2145. unsigned long pfn;
  2146. int flags;
  2147. };
  2148. struct memory_failure_cpu {
  2149. DECLARE_KFIFO(fifo, struct memory_failure_entry,
  2150. MEMORY_FAILURE_FIFO_SIZE);
  2151. raw_spinlock_t lock;
  2152. struct work_struct work;
  2153. };
  2154. static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
  2155. /**
  2156. * memory_failure_queue - Schedule handling memory failure of a page.
  2157. * @pfn: Page Number of the corrupted page
  2158. * @flags: Flags for memory failure handling
  2159. *
  2160. * This function is called by the low level hardware error handler
  2161. * when it detects hardware memory corruption of a page. It schedules
  2162. * the recovering of error page, including dropping pages, killing
  2163. * processes etc.
  2164. *
  2165. * The function is primarily of use for corruptions that
  2166. * happen outside the current execution context (e.g. when
  2167. * detected by a background scrubber)
  2168. *
  2169. * Can run in IRQ context.
  2170. */
  2171. void memory_failure_queue(unsigned long pfn, int flags)
  2172. {
  2173. struct memory_failure_cpu *mf_cpu;
  2174. unsigned long proc_flags;
  2175. bool buffer_overflow;
  2176. struct memory_failure_entry entry = {
  2177. .pfn = pfn,
  2178. .flags = flags,
  2179. };
  2180. mf_cpu = &get_cpu_var(memory_failure_cpu);
  2181. raw_spin_lock_irqsave(&mf_cpu->lock, proc_flags);
  2182. buffer_overflow = !kfifo_put(&mf_cpu->fifo, entry);
  2183. if (!buffer_overflow)
  2184. schedule_work_on(smp_processor_id(), &mf_cpu->work);
  2185. raw_spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
  2186. put_cpu_var(memory_failure_cpu);
  2187. if (buffer_overflow)
  2188. pr_err("buffer overflow when queuing memory failure at %#lx\n",
  2189. pfn);
  2190. }
  2191. EXPORT_SYMBOL_GPL(memory_failure_queue);
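/*
 * Minimal sketch of deferring recovery from an interrupt-context error
 * handler (for example an EDAC or GHES driver) where memory_failure()
 * cannot be called directly. The function name and driver context are
 * illustrative assumptions.
 */
static void __maybe_unused example_report_uncorrected_error(u64 phys_addr)
{
	unsigned long pfn = PHYS_PFN(phys_addr);

	/* Safe in IRQ context; the real handling runs later via a workqueue. */
	memory_failure_queue(pfn, 0);
}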
  2192. static void memory_failure_work_func(struct work_struct *work)
  2193. {
  2194. struct memory_failure_cpu *mf_cpu;
  2195. struct memory_failure_entry entry = { 0, };
  2196. unsigned long proc_flags;
  2197. int gotten;
  2198. mf_cpu = container_of(work, struct memory_failure_cpu, work);
  2199. for (;;) {
  2200. raw_spin_lock_irqsave(&mf_cpu->lock, proc_flags);
  2201. gotten = kfifo_get(&mf_cpu->fifo, &entry);
  2202. raw_spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
  2203. if (!gotten)
  2204. break;
  2205. if (entry.flags & MF_SOFT_OFFLINE)
  2206. soft_offline_page(entry.pfn, entry.flags);
  2207. else
  2208. memory_failure(entry.pfn, entry.flags);
  2209. }
  2210. }
  2211. /*
  2212. * Process memory_failure work queued on the specified CPU.
  2213. * Used to avoid return-to-userspace racing with the memory_failure workqueue.
  2214. */
  2215. void memory_failure_queue_kick(int cpu)
  2216. {
  2217. struct memory_failure_cpu *mf_cpu;
  2218. mf_cpu = &per_cpu(memory_failure_cpu, cpu);
  2219. cancel_work_sync(&mf_cpu->work);
  2220. memory_failure_work_func(&mf_cpu->work);
  2221. }
  2222. static int __init memory_failure_init(void)
  2223. {
  2224. struct memory_failure_cpu *mf_cpu;
  2225. int cpu;
  2226. for_each_possible_cpu(cpu) {
  2227. mf_cpu = &per_cpu(memory_failure_cpu, cpu);
  2228. raw_spin_lock_init(&mf_cpu->lock);
  2229. INIT_KFIFO(mf_cpu->fifo);
  2230. INIT_WORK(&mf_cpu->work, memory_failure_work_func);
  2231. }
  2232. register_sysctl_init("vm", memory_failure_table);
  2233. return 0;
  2234. }
  2235. core_initcall(memory_failure_init);
  2236. #undef pr_fmt
  2237. #define pr_fmt(fmt) "Unpoison: " fmt
  2238. #define unpoison_pr_info(fmt, pfn, rs) \
  2239. ({ \
  2240. if (__ratelimit(rs)) \
  2241. pr_info(fmt, pfn); \
  2242. })
  2243. /**
  2244. * unpoison_memory - Unpoison a previously poisoned page
  2245. * @pfn: Page number of the to be unpoisoned page
  2246. *
  2247. * Software-unpoison a page that has been poisoned by
  2248. * memory_failure() earlier.
  2249. *
  2250. * This is only done on the software-level, so it only works
  2251. * for linux injected failures, not real hardware failures
  2252. *
  2253. * Returns 0 for success, otherwise -errno.
  2254. */
  2255. int unpoison_memory(unsigned long pfn)
  2256. {
  2257. struct folio *folio;
  2258. struct page *p;
  2259. int ret = -EBUSY, ghp;
  2260. unsigned long count;
  2261. bool huge = false;
  2262. static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
  2263. DEFAULT_RATELIMIT_BURST);
  2264. p = pfn_to_online_page(pfn);
  2265. if (!p)
  2266. return -EIO;
  2267. folio = page_folio(p);
  2268. mutex_lock(&mf_mutex);
  2269. if (hw_memory_failure) {
  2270. unpoison_pr_info("%#lx: disabled after HW memory failure\n",
  2271. pfn, &unpoison_rs);
  2272. ret = -EOPNOTSUPP;
  2273. goto unlock_mutex;
  2274. }
  2275. if (is_huge_zero_folio(folio)) {
  2276. unpoison_pr_info("%#lx: huge zero page is not supported\n",
  2277. pfn, &unpoison_rs);
  2278. ret = -EOPNOTSUPP;
  2279. goto unlock_mutex;
  2280. }
  2281. if (!PageHWPoison(p)) {
  2282. unpoison_pr_info("%#lx: page was already unpoisoned\n",
  2283. pfn, &unpoison_rs);
  2284. goto unlock_mutex;
  2285. }
  2286. if (folio_ref_count(folio) > 1) {
  2287. unpoison_pr_info("%#lx: someone grabs the hwpoison page\n",
  2288. pfn, &unpoison_rs);
  2289. goto unlock_mutex;
  2290. }
  2291. if (folio_test_slab(folio) || folio_test_pgtable(folio) ||
  2292. folio_test_reserved(folio) || folio_test_offline(folio))
  2293. goto unlock_mutex;
  2294. if (folio_mapped(folio)) {
  2295. unpoison_pr_info("%#lx: someone maps the hwpoison page\n",
  2296. pfn, &unpoison_rs);
  2297. goto unlock_mutex;
  2298. }
  2299. if (folio_mapping(folio)) {
  2300. unpoison_pr_info("%#lx: the hwpoison page has non-NULL mapping\n",
  2301. pfn, &unpoison_rs);
  2302. goto unlock_mutex;
  2303. }
  2304. ghp = get_hwpoison_page(p, MF_UNPOISON);
  2305. if (!ghp) {
  2306. if (folio_test_hugetlb(folio)) {
  2307. huge = true;
  2308. count = folio_free_raw_hwp(folio, false);
  2309. if (count == 0)
  2310. goto unlock_mutex;
  2311. }
  2312. ret = folio_test_clear_hwpoison(folio) ? 0 : -EBUSY;
  2313. } else if (ghp < 0) {
  2314. if (ghp == -EHWPOISON) {
  2315. ret = put_page_back_buddy(p) ? 0 : -EBUSY;
  2316. } else {
  2317. ret = ghp;
  2318. unpoison_pr_info("%#lx: failed to grab page\n",
  2319. pfn, &unpoison_rs);
  2320. }
  2321. } else {
  2322. if (folio_test_hugetlb(folio)) {
  2323. huge = true;
  2324. count = folio_free_raw_hwp(folio, false);
  2325. if (count == 0) {
  2326. folio_put(folio);
  2327. goto unlock_mutex;
  2328. }
  2329. }
  2330. folio_put(folio);
  2331. if (TestClearPageHWPoison(p)) {
  2332. folio_put(folio);
  2333. ret = 0;
  2334. }
  2335. }
  2336. unlock_mutex:
  2337. mutex_unlock(&mf_mutex);
  2338. if (!ret) {
  2339. if (!huge)
  2340. num_poisoned_pages_sub(pfn, 1);
  2341. unpoison_pr_info("%#lx: software-unpoisoned page\n",
  2342. page_to_pfn(p), &unpoison_rs);
  2343. }
  2344. return ret;
  2345. }
  2346. EXPORT_SYMBOL(unpoison_memory);
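/*
 * Minimal sketch of reverting a software-injected error, similar in spirit
 * to the hwpoison debugfs interface: poison a pfn with MF_SW_SIMULATED so
 * that hw_memory_failure is not set, then unpoison it again. The wrapper
 * below is hypothetical and for illustration only.
 */
static int __maybe_unused example_inject_and_revert(unsigned long pfn)
{
	int ret = memory_failure(pfn, MF_SW_SIMULATED);

	if (ret)
		return ret;
	return unpoison_memory(pfn);
}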
  2347. #undef pr_fmt
  2348. #define pr_fmt(fmt) "Soft offline: " fmt
/*
 * soft_offline_in_use_page() handles both hugetlb and non-hugetlb pages.
 * A clean, unmapped page-cache page is simply invalidated; a mapped page
 * has its contents migrated away first.
 */
  2354. static int soft_offline_in_use_page(struct page *page)
  2355. {
  2356. long ret = 0;
  2357. unsigned long pfn = page_to_pfn(page);
  2358. struct folio *folio = page_folio(page);
  2359. char const *msg_page[] = {"page", "hugepage"};
  2360. bool huge = folio_test_hugetlb(folio);
  2361. bool isolated;
  2362. LIST_HEAD(pagelist);
  2363. struct migration_target_control mtc = {
  2364. .nid = NUMA_NO_NODE,
  2365. .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
  2366. .reason = MR_MEMORY_FAILURE,
  2367. };
  2368. if (!huge && folio_test_large(folio)) {
  2369. if (try_to_split_thp_page(page, true)) {
  2370. pr_info("%#lx: thp split failed\n", pfn);
  2371. return -EBUSY;
  2372. }
  2373. folio = page_folio(page);
  2374. }
  2375. folio_lock(folio);
  2376. if (!huge)
  2377. folio_wait_writeback(folio);
  2378. if (PageHWPoison(page)) {
  2379. folio_unlock(folio);
  2380. folio_put(folio);
  2381. pr_info("%#lx: page already poisoned\n", pfn);
  2382. return 0;
  2383. }
  2384. if (!huge && folio_test_lru(folio) && !folio_test_swapcache(folio))
  2385. /*
  2386. * Try to invalidate first. This should work for
  2387. * non dirty unmapped page cache pages.
  2388. */
  2389. ret = mapping_evict_folio(folio_mapping(folio), folio);
  2390. folio_unlock(folio);
  2391. if (ret) {
  2392. pr_info("%#lx: invalidated\n", pfn);
  2393. page_handle_poison(page, false, true);
  2394. return 0;
  2395. }
  2396. isolated = isolate_folio_to_list(folio, &pagelist);
/*
 * If isolating the folio succeeded, we took another refcount on it, so
 * the one obtained from get_any_page() can be dropped safely. If
 * isolation failed, we cannot go any further and will return an error,
 * so drop the get_any_page() reference as well.
 */
  2404. folio_put(folio);
  2405. if (isolated) {
  2406. ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
  2407. (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL);
  2408. if (!ret) {
  2409. bool release = !huge;
  2410. if (!page_handle_poison(page, huge, release))
  2411. ret = -EBUSY;
  2412. } else {
  2413. if (!list_empty(&pagelist))
  2414. putback_movable_pages(&pagelist);
  2415. pr_info("%#lx: %s migration failed %ld, type %pGp\n",
  2416. pfn, msg_page[huge], ret, &page->flags);
  2417. if (ret > 0)
  2418. ret = -EBUSY;
  2419. }
  2420. } else {
  2421. pr_info("%#lx: %s isolation failed, page count %d, type %pGp\n",
  2422. pfn, msg_page[huge], page_count(page), &page->flags);
  2423. ret = -EBUSY;
  2424. }
  2425. return ret;
  2426. }
  2427. /**
  2428. * soft_offline_page - Soft offline a page.
  2429. * @pfn: pfn to soft-offline
  2430. * @flags: flags. Same as memory_failure().
  2431. *
 * Returns 0 on success,
 *         -EOPNOTSUPP if hwpoison_filter() filtered the error event or if
 *         soft offline is disabled via /proc/sys/vm/enable_soft_offline,
 *         a negative errno otherwise.
  2436. *
  2437. * Soft offline a page, by migration or invalidation,
  2438. * without killing anything. This is for the case when
  2439. * a page is not corrupted yet (so it's still valid to access),
  2440. * but has had a number of corrected errors and is better taken
  2441. * out.
  2442. *
  2443. * The actual policy on when to do that is maintained by
  2444. * user space.
  2445. *
  2446. * This should never impact any application or cause data loss,
  2447. * however it might take some time.
  2448. *
  2449. * This is not a 100% solution for all memory, but tries to be
  2450. * ``good enough'' for the majority of memory.
  2451. */
  2452. int soft_offline_page(unsigned long pfn, int flags)
  2453. {
  2454. int ret;
  2455. bool try_again = true;
  2456. struct page *page;
  2457. if (!pfn_valid(pfn)) {
  2458. WARN_ON_ONCE(flags & MF_COUNT_INCREASED);
  2459. return -ENXIO;
  2460. }
  2461. /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
  2462. page = pfn_to_online_page(pfn);
  2463. if (!page) {
  2464. put_ref_page(pfn, flags);
  2465. return -EIO;
  2466. }
  2467. if (!sysctl_enable_soft_offline) {
  2468. pr_info_once("disabled by /proc/sys/vm/enable_soft_offline\n");
  2469. put_ref_page(pfn, flags);
  2470. return -EOPNOTSUPP;
  2471. }
  2472. mutex_lock(&mf_mutex);
  2473. if (PageHWPoison(page)) {
  2474. pr_info("%#lx: page already poisoned\n", pfn);
  2475. put_ref_page(pfn, flags);
  2476. mutex_unlock(&mf_mutex);
  2477. return 0;
  2478. }
  2479. retry:
  2480. get_online_mems();
  2481. ret = get_hwpoison_page(page, flags | MF_SOFT_OFFLINE);
  2482. put_online_mems();
  2483. if (hwpoison_filter(page)) {
  2484. if (ret > 0)
  2485. put_page(page);
  2486. mutex_unlock(&mf_mutex);
  2487. return -EOPNOTSUPP;
  2488. }
  2489. if (ret > 0) {
  2490. ret = soft_offline_in_use_page(page);
  2491. } else if (ret == 0) {
  2492. if (!page_handle_poison(page, true, false)) {
  2493. if (try_again) {
  2494. try_again = false;
  2495. flags &= ~MF_COUNT_INCREASED;
  2496. goto retry;
  2497. }
  2498. ret = -EBUSY;
  2499. }
  2500. }
  2501. mutex_unlock(&mf_mutex);
  2502. return ret;
  2503. }
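/*
 * Minimal sketch of a corrected-error threshold policy, similar in spirit
 * to drivers/ras/cec.c or a userspace RAS daemon: once a page has logged
 * too many corrected errors, take it out of service preemptively. The
 * trigger and function name are illustrative assumptions.
 */
static void __maybe_unused example_ce_threshold_reached(unsigned long pfn)
{
	int ret = soft_offline_page(pfn, 0);

	if (ret && ret != -EOPNOTSUPP)
		pr_warn("%#lx: could not soft offline: %d\n", pfn, ret);
}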