memory-failure.c 75 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511
661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542
05520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Copyright (C) 2008, 2009 Intel Corporation
  4. * Authors: Andi Kleen, Fengguang Wu
  5. *
  6. * High level machine check handler. Handles pages reported by the
  7. * hardware as being corrupted usually due to a multi-bit ECC memory or cache
  8. * failure.
  9. *
  10. * In addition there is a "soft offline" entry point that allows stop using
  11. * not-yet-corrupted-by-suspicious pages without killing anything.
  12. *
  13. * Handles page cache pages in various states. The tricky part
  14. * here is that we can access any page asynchronously in respect to
  15. * other VM users, because memory failures could happen anytime and
  16. * anywhere. This could violate some of their assumptions. This is why
  17. * this code has to be extremely careful. Generally it tries to use
  18. * normal locking rules, as in get the standard locks, even if that means
  19. * the error handling takes potentially a long time.
  20. *
  21. * It can be very tempting to add handling for obscure cases here.
  22. * In general any code for handling new cases should only be added iff:
  23. * - You know how to test it.
  24. * - You have a test that can be added to mce-test
  25. * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
  26. * - The case actually shows up as a frequent (top 10) page state in
  27. * tools/mm/page-types when running a real workload.
  28. *
  29. * There are several operations here with exponential complexity because
  30. * of unsuitable VM data structures. For example the operation to map back
  31. * from RMAP chains to processes has to walk the complete process list and
  32. * has non linear complexity with the number. But since memory corruptions
  33. * are rare we hope to get away with this. This avoids impacting the core
  34. * VM.
  35. */
  36. #define pr_fmt(fmt) "Memory failure: " fmt
  37. #include <linux/kernel.h>
  38. #include <linux/mm.h>
  39. #include <linux/page-flags.h>
  40. #include <linux/sched/signal.h>
  41. #include <linux/sched/task.h>
  42. #include <linux/dax.h>
  43. #include <linux/ksm.h>
  44. #include <linux/rmap.h>
  45. #include <linux/export.h>
  46. #include <linux/pagemap.h>
  47. #include <linux/swap.h>
  48. #include <linux/backing-dev.h>
  49. #include <linux/migrate.h>
  50. #include <linux/slab.h>
  51. #include <linux/swapops.h>
  52. #include <linux/hugetlb.h>
  53. #include <linux/memory_hotplug.h>
  54. #include <linux/mm_inline.h>
  55. #include <linux/memremap.h>
  56. #include <linux/kfifo.h>
  57. #include <linux/ratelimit.h>
  58. #include <linux/pagewalk.h>
  59. #include <linux/shmem_fs.h>
  60. #include <linux/sysctl.h>
  61. #include "swap.h"
  62. #include "internal.h"
  63. #include "ras/ras_event.h"
  64. static int sysctl_memory_failure_early_kill __read_mostly;
  65. static int sysctl_memory_failure_recovery __read_mostly = 1;
  66. static int sysctl_enable_soft_offline __read_mostly = 1;
  67. atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
  68. static bool hw_memory_failure __read_mostly = false;
  69. static DEFINE_MUTEX(mf_mutex);
  70. void num_poisoned_pages_inc(unsigned long pfn)
  71. {
  72. atomic_long_inc(&num_poisoned_pages);
  73. memblk_nr_poison_inc(pfn);
  74. }
  75. void num_poisoned_pages_sub(unsigned long pfn, long i)
  76. {
  77. atomic_long_sub(i, &num_poisoned_pages);
  78. if (pfn != -1UL)
  79. memblk_nr_poison_sub(pfn, i);
  80. }
  81. /**
  82. * MF_ATTR_RO - Create sysfs entry for each memory failure statistics.
  83. * @_name: name of the file in the per NUMA sysfs directory.
  84. */
  85. #define MF_ATTR_RO(_name) \
  86. static ssize_t _name##_show(struct device *dev, \
  87. struct device_attribute *attr, \
  88. char *buf) \
  89. { \
  90. struct memory_failure_stats *mf_stats = \
  91. &NODE_DATA(dev->id)->mf_stats; \
  92. return sprintf(buf, "%lu\n", mf_stats->_name); \
  93. } \
  94. static DEVICE_ATTR_RO(_name)
  95. MF_ATTR_RO(total);
  96. MF_ATTR_RO(ignored);
  97. MF_ATTR_RO(failed);
  98. MF_ATTR_RO(delayed);
  99. MF_ATTR_RO(recovered);
  100. static struct attribute *memory_failure_attr[] = {
  101. &dev_attr_total.attr,
  102. &dev_attr_ignored.attr,
  103. &dev_attr_failed.attr,
  104. &dev_attr_delayed.attr,
  105. &dev_attr_recovered.attr,
  106. NULL,
  107. };
  108. const struct attribute_group memory_failure_attr_group = {
  109. .name = "memory_failure",
  110. .attrs = memory_failure_attr,
  111. };
  112. static struct ctl_table memory_failure_table[] = {
  113. {
  114. .procname = "memory_failure_early_kill",
  115. .data = &sysctl_memory_failure_early_kill,
  116. .maxlen = sizeof(sysctl_memory_failure_early_kill),
  117. .mode = 0644,
  118. .proc_handler = proc_dointvec_minmax,
  119. .extra1 = SYSCTL_ZERO,
  120. .extra2 = SYSCTL_ONE,
  121. },
  122. {
  123. .procname = "memory_failure_recovery",
  124. .data = &sysctl_memory_failure_recovery,
  125. .maxlen = sizeof(sysctl_memory_failure_recovery),
  126. .mode = 0644,
  127. .proc_handler = proc_dointvec_minmax,
  128. .extra1 = SYSCTL_ZERO,
  129. .extra2 = SYSCTL_ONE,
  130. },
  131. {
  132. .procname = "enable_soft_offline",
  133. .data = &sysctl_enable_soft_offline,
  134. .maxlen = sizeof(sysctl_enable_soft_offline),
  135. .mode = 0644,
  136. .proc_handler = proc_dointvec_minmax,
  137. .extra1 = SYSCTL_ZERO,
  138. .extra2 = SYSCTL_ONE,
  139. }
  140. };
  141. /*
  142. * Return values:
  143. * 1: the page is dissolved (if needed) and taken off from buddy,
  144. * 0: the page is dissolved (if needed) and not taken off from buddy,
  145. * < 0: failed to dissolve.
  146. */
  147. static int __page_handle_poison(struct page *page)
  148. {
  149. int ret;
  150. /*
  151. * zone_pcp_disable() can't be used here. It will
  152. * hold pcp_batch_high_lock and dissolve_free_hugetlb_folio() might hold
  153. * cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap
  154. * optimization is enabled. This will break current lock dependency
  155. * chain and leads to deadlock.
  156. * Disabling pcp before dissolving the page was a deterministic
  157. * approach because we made sure that those pages cannot end up in any
  158. * PCP list. Draining PCP lists expels those pages to the buddy system,
  159. * but nothing guarantees that those pages do not get back to a PCP
  160. * queue if we need to refill those.
  161. */
  162. ret = dissolve_free_hugetlb_folio(page_folio(page));
  163. if (!ret) {
  164. drain_all_pages(page_zone(page));
  165. ret = take_page_off_buddy(page);
  166. }
  167. return ret;
  168. }
  169. static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
  170. {
  171. if (hugepage_or_freepage) {
  172. /*
  173. * Doing this check for free pages is also fine since
  174. * dissolve_free_hugetlb_folio() returns 0 for non-hugetlb folios as well.
  175. */
  176. if (__page_handle_poison(page) <= 0)
  177. /*
  178. * We could fail to take off the target page from buddy
  179. * for example due to racy page allocation, but that's
  180. * acceptable because soft-offlined page is not broken
  181. * and if someone really want to use it, they should
  182. * take it.
  183. */
  184. return false;
  185. }
  186. SetPageHWPoison(page);
  187. if (release)
  188. put_page(page);
  189. page_ref_inc(page);
  190. num_poisoned_pages_inc(page_to_pfn(page));
  191. return true;
  192. }
  193. #if IS_ENABLED(CONFIG_HWPOISON_INJECT)
  194. u32 hwpoison_filter_enable = 0;
  195. u32 hwpoison_filter_dev_major = ~0U;
  196. u32 hwpoison_filter_dev_minor = ~0U;
  197. u64 hwpoison_filter_flags_mask;
  198. u64 hwpoison_filter_flags_value;
  199. EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
  200. EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
  201. EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
  202. EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
  203. EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
  204. static int hwpoison_filter_dev(struct page *p)
  205. {
  206. struct folio *folio = page_folio(p);
  207. struct address_space *mapping;
  208. dev_t dev;
  209. if (hwpoison_filter_dev_major == ~0U &&
  210. hwpoison_filter_dev_minor == ~0U)
  211. return 0;
  212. mapping = folio_mapping(folio);
  213. if (mapping == NULL || mapping->host == NULL)
  214. return -EINVAL;
  215. dev = mapping->host->i_sb->s_dev;
  216. if (hwpoison_filter_dev_major != ~0U &&
  217. hwpoison_filter_dev_major != MAJOR(dev))
  218. return -EINVAL;
  219. if (hwpoison_filter_dev_minor != ~0U &&
  220. hwpoison_filter_dev_minor != MINOR(dev))
  221. return -EINVAL;
  222. return 0;
  223. }
  224. static int hwpoison_filter_flags(struct page *p)
  225. {
  226. if (!hwpoison_filter_flags_mask)
  227. return 0;
  228. if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
  229. hwpoison_filter_flags_value)
  230. return 0;
  231. else
  232. return -EINVAL;
  233. }
  234. /*
  235. * This allows stress tests to limit test scope to a collection of tasks
  236. * by putting them under some memcg. This prevents killing unrelated/important
  237. * processes such as /sbin/init. Note that the target task may share clean
  238. * pages with init (eg. libc text), which is harmless. If the target task
  239. * share _dirty_ pages with another task B, the test scheme must make sure B
  240. * is also included in the memcg. At last, due to race conditions this filter
  241. * can only guarantee that the page either belongs to the memcg tasks, or is
  242. * a freed page.
  243. */
  244. #ifdef CONFIG_MEMCG
  245. u64 hwpoison_filter_memcg;
  246. EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
  247. static int hwpoison_filter_task(struct page *p)
  248. {
  249. if (!hwpoison_filter_memcg)
  250. return 0;
  251. if (page_cgroup_ino(p) != hwpoison_filter_memcg)
  252. return -EINVAL;
  253. return 0;
  254. }
  255. #else
  256. static int hwpoison_filter_task(struct page *p) { return 0; }
  257. #endif
  258. int hwpoison_filter(struct page *p)
  259. {
  260. if (!hwpoison_filter_enable)
  261. return 0;
  262. if (hwpoison_filter_dev(p))
  263. return -EINVAL;
  264. if (hwpoison_filter_flags(p))
  265. return -EINVAL;
  266. if (hwpoison_filter_task(p))
  267. return -EINVAL;
  268. return 0;
  269. }
  270. EXPORT_SYMBOL_GPL(hwpoison_filter);
  271. #else
  272. int hwpoison_filter(struct page *p)
  273. {
  274. return 0;
  275. }
  276. #endif
  277. /*
  278. * Kill all processes that have a poisoned page mapped and then isolate
  279. * the page.
  280. *
  281. * General strategy:
  282. * Find all processes having the page mapped and kill them.
  283. * But we keep a page reference around so that the page is not
  284. * actually freed yet.
  285. * Then stash the page away
  286. *
  287. * There's no convenient way to get back to mapped processes
  288. * from the VMAs. So do a brute-force search over all
  289. * running processes.
  290. *
  291. * Remember that machine checks are not common (or rather
  292. * if they are common you have other problems), so this shouldn't
  293. * be a performance issue.
  294. *
  295. * Also there are some races possible while we get from the
  296. * error detection to actually handle it.
  297. */
  298. struct to_kill {
  299. struct list_head nd;
  300. struct task_struct *tsk;
  301. unsigned long addr;
  302. short size_shift;
  303. };
  304. /*
  305. * Send all the processes who have the page mapped a signal.
  306. * ``action optional'' if they are not immediately affected by the error
  307. * ``action required'' if error happened in current execution context
  308. */
  309. static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
  310. {
  311. struct task_struct *t = tk->tsk;
  312. short addr_lsb = tk->size_shift;
  313. int ret = 0;
  314. pr_err("%#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
  315. pfn, t->comm, task_pid_nr(t));
  316. if ((flags & MF_ACTION_REQUIRED) && (t == current))
  317. ret = force_sig_mceerr(BUS_MCEERR_AR,
  318. (void __user *)tk->addr, addr_lsb);
  319. else
  320. /*
  321. * Signal other processes sharing the page if they have
  322. * PF_MCE_EARLY set.
  323. * Don't use force here, it's convenient if the signal
  324. * can be temporarily blocked.
  325. */
  326. ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
  327. addr_lsb, t);
  328. if (ret < 0)
  329. pr_info("Error sending signal to %s:%d: %d\n",
  330. t->comm, task_pid_nr(t), ret);
  331. return ret;
  332. }
  333. /*
  334. * Unknown page type encountered. Try to check whether it can turn PageLRU by
  335. * lru_add_drain_all.
  336. */
  337. void shake_folio(struct folio *folio)
  338. {
  339. if (folio_test_hugetlb(folio))
  340. return;
  341. /*
  342. * TODO: Could shrink slab caches here if a lightweight range-based
  343. * shrinker will be available.
  344. */
  345. if (folio_test_slab(folio))
  346. return;
  347. lru_add_drain_all();
  348. }
  349. EXPORT_SYMBOL_GPL(shake_folio);
  350. static void shake_page(struct page *page)
  351. {
  352. shake_folio(page_folio(page));
  353. }
  354. static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
  355. unsigned long address)
  356. {
  357. unsigned long ret = 0;
  358. pgd_t *pgd;
  359. p4d_t *p4d;
  360. pud_t *pud;
  361. pmd_t *pmd;
  362. pte_t *pte;
  363. pte_t ptent;
  364. VM_BUG_ON_VMA(address == -EFAULT, vma);
  365. pgd = pgd_offset(vma->vm_mm, address);
  366. if (!pgd_present(*pgd))
  367. return 0;
  368. p4d = p4d_offset(pgd, address);
  369. if (!p4d_present(*p4d))
  370. return 0;
  371. pud = pud_offset(p4d, address);
  372. if (!pud_present(*pud))
  373. return 0;
  374. if (pud_devmap(*pud))
  375. return PUD_SHIFT;
  376. pmd = pmd_offset(pud, address);
  377. if (!pmd_present(*pmd))
  378. return 0;
  379. if (pmd_devmap(*pmd))
  380. return PMD_SHIFT;
  381. pte = pte_offset_map(pmd, address);
  382. if (!pte)
  383. return 0;
  384. ptent = ptep_get(pte);
  385. if (pte_present(ptent) && pte_devmap(ptent))
  386. ret = PAGE_SHIFT;
  387. pte_unmap(pte);
  388. return ret;
  389. }
  390. /*
  391. * Failure handling: if we can't find or can't kill a process there's
  392. * not much we can do. We just print a message and ignore otherwise.
  393. */
  394. /*
  395. * Schedule a process for later kill.
  396. * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
  397. */
  398. static void __add_to_kill(struct task_struct *tsk, struct page *p,
  399. struct vm_area_struct *vma, struct list_head *to_kill,
  400. unsigned long addr)
  401. {
  402. struct to_kill *tk;
  403. tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
  404. if (!tk) {
  405. pr_err("Out of memory while machine check handling\n");
  406. return;
  407. }
  408. tk->addr = addr;
  409. if (is_zone_device_page(p))
  410. tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
  411. else
  412. tk->size_shift = page_shift(compound_head(p));
  413. /*
  414. * Send SIGKILL if "tk->addr == -EFAULT". Also, as
  415. * "tk->size_shift" is always non-zero for !is_zone_device_page(),
  416. * so "tk->size_shift == 0" effectively checks no mapping on
  417. * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times
  418. * to a process' address space, it's possible not all N VMAs
  419. * contain mappings for the page, but at least one VMA does.
  420. * Only deliver SIGBUS with payload derived from the VMA that
  421. * has a mapping for the page.
  422. */
  423. if (tk->addr == -EFAULT) {
  424. pr_info("Unable to find user space address %lx in %s\n",
  425. page_to_pfn(p), tsk->comm);
  426. } else if (tk->size_shift == 0) {
  427. kfree(tk);
  428. return;
  429. }
  430. get_task_struct(tsk);
  431. tk->tsk = tsk;
  432. list_add_tail(&tk->nd, to_kill);
  433. }
  434. static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
  435. struct vm_area_struct *vma, struct list_head *to_kill,
  436. unsigned long addr)
  437. {
  438. if (addr == -EFAULT)
  439. return;
  440. __add_to_kill(tsk, p, vma, to_kill, addr);
  441. }
  442. #ifdef CONFIG_KSM
  443. static bool task_in_to_kill_list(struct list_head *to_kill,
  444. struct task_struct *tsk)
  445. {
  446. struct to_kill *tk, *next;
  447. list_for_each_entry_safe(tk, next, to_kill, nd) {
  448. if (tk->tsk == tsk)
  449. return true;
  450. }
  451. return false;
  452. }
  453. void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
  454. struct vm_area_struct *vma, struct list_head *to_kill,
  455. unsigned long addr)
  456. {
  457. if (!task_in_to_kill_list(to_kill, tsk))
  458. __add_to_kill(tsk, p, vma, to_kill, addr);
  459. }
  460. #endif
  461. /*
  462. * Kill the processes that have been collected earlier.
  463. *
  464. * Only do anything when FORCEKILL is set, otherwise just free the
  465. * list (this is used for clean pages which do not need killing)
  466. */
  467. static void kill_procs(struct list_head *to_kill, int forcekill,
  468. unsigned long pfn, int flags)
  469. {
  470. struct to_kill *tk, *next;
  471. list_for_each_entry_safe(tk, next, to_kill, nd) {
  472. if (forcekill) {
  473. if (tk->addr == -EFAULT) {
  474. pr_err("%#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
  475. pfn, tk->tsk->comm, task_pid_nr(tk->tsk));
  476. do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
  477. tk->tsk, PIDTYPE_PID);
  478. }
  479. /*
  480. * In theory the process could have mapped
  481. * something else on the address in-between. We could
  482. * check for that, but we need to tell the
  483. * process anyways.
  484. */
  485. else if (kill_proc(tk, pfn, flags) < 0)
  486. pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n",
  487. pfn, tk->tsk->comm, task_pid_nr(tk->tsk));
  488. }
  489. list_del(&tk->nd);
  490. put_task_struct(tk->tsk);
  491. kfree(tk);
  492. }
  493. }
  494. /*
  495. * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
  496. * on behalf of the thread group. Return task_struct of the (first found)
  497. * dedicated thread if found, and return NULL otherwise.
  498. *
  499. * We already hold rcu lock in the caller, so we don't have to call
  500. * rcu_read_lock/unlock() in this function.
  501. */
  502. static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
  503. {
  504. struct task_struct *t;
  505. for_each_thread(tsk, t) {
  506. if (t->flags & PF_MCE_PROCESS) {
  507. if (t->flags & PF_MCE_EARLY)
  508. return t;
  509. } else {
  510. if (sysctl_memory_failure_early_kill)
  511. return t;
  512. }
  513. }
  514. return NULL;
  515. }
  516. /*
  517. * Determine whether a given process is "early kill" process which expects
  518. * to be signaled when some page under the process is hwpoisoned.
  519. * Return task_struct of the dedicated thread (main thread unless explicitly
  520. * specified) if the process is "early kill" and otherwise returns NULL.
  521. *
  522. * Note that the above is true for Action Optional case. For Action Required
  523. * case, it's only meaningful to the current thread which need to be signaled
  524. * with SIGBUS, this error is Action Optional for other non current
  525. * processes sharing the same error page,if the process is "early kill", the
  526. * task_struct of the dedicated thread will also be returned.
  527. */
  528. struct task_struct *task_early_kill(struct task_struct *tsk, int force_early)
  529. {
  530. if (!tsk->mm)
  531. return NULL;
  532. /*
  533. * Comparing ->mm here because current task might represent
  534. * a subthread, while tsk always points to the main thread.
  535. */
  536. if (force_early && tsk->mm == current->mm)
  537. return current;
  538. return find_early_kill_thread(tsk);
  539. }
  540. /*
  541. * Collect processes when the error hit an anonymous page.
  542. */
  543. static void collect_procs_anon(struct folio *folio, struct page *page,
  544. struct list_head *to_kill, int force_early)
  545. {
  546. struct task_struct *tsk;
  547. struct anon_vma *av;
  548. pgoff_t pgoff;
  549. av = folio_lock_anon_vma_read(folio, NULL);
  550. if (av == NULL) /* Not actually mapped anymore */
  551. return;
  552. pgoff = page_to_pgoff(page);
  553. rcu_read_lock();
  554. for_each_process(tsk) {
  555. struct vm_area_struct *vma;
  556. struct anon_vma_chain *vmac;
  557. struct task_struct *t = task_early_kill(tsk, force_early);
  558. unsigned long addr;
  559. if (!t)
  560. continue;
  561. anon_vma_interval_tree_foreach(vmac, &av->rb_root,
  562. pgoff, pgoff) {
  563. vma = vmac->vma;
  564. if (vma->vm_mm != t->mm)
  565. continue;
  566. addr = page_mapped_in_vma(page, vma);
  567. add_to_kill_anon_file(t, page, vma, to_kill, addr);
  568. }
  569. }
  570. rcu_read_unlock();
  571. anon_vma_unlock_read(av);
  572. }
  573. /*
  574. * Collect processes when the error hit a file mapped page.
  575. */
  576. static void collect_procs_file(struct folio *folio, struct page *page,
  577. struct list_head *to_kill, int force_early)
  578. {
  579. struct vm_area_struct *vma;
  580. struct task_struct *tsk;
  581. struct address_space *mapping = folio->mapping;
  582. pgoff_t pgoff;
  583. i_mmap_lock_read(mapping);
  584. rcu_read_lock();
  585. pgoff = page_to_pgoff(page);
  586. for_each_process(tsk) {
  587. struct task_struct *t = task_early_kill(tsk, force_early);
  588. unsigned long addr;
  589. if (!t)
  590. continue;
  591. vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
  592. pgoff) {
  593. /*
  594. * Send early kill signal to tasks where a vma covers
  595. * the page but the corrupted page is not necessarily
  596. * mapped in its pte.
  597. * Assume applications who requested early kill want
  598. * to be informed of all such data corruptions.
  599. */
  600. if (vma->vm_mm != t->mm)
  601. continue;
  602. addr = page_address_in_vma(page, vma);
  603. add_to_kill_anon_file(t, page, vma, to_kill, addr);
  604. }
  605. }
  606. rcu_read_unlock();
  607. i_mmap_unlock_read(mapping);
  608. }
  609. #ifdef CONFIG_FS_DAX
  610. static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
  611. struct vm_area_struct *vma,
  612. struct list_head *to_kill, pgoff_t pgoff)
  613. {
  614. unsigned long addr = vma_address(vma, pgoff, 1);
  615. __add_to_kill(tsk, p, vma, to_kill, addr);
  616. }
  617. /*
  618. * Collect processes when the error hit a fsdax page.
  619. */
  620. static void collect_procs_fsdax(struct page *page,
  621. struct address_space *mapping, pgoff_t pgoff,
  622. struct list_head *to_kill, bool pre_remove)
  623. {
  624. struct vm_area_struct *vma;
  625. struct task_struct *tsk;
  626. i_mmap_lock_read(mapping);
  627. rcu_read_lock();
  628. for_each_process(tsk) {
  629. struct task_struct *t = tsk;
  630. /*
  631. * Search for all tasks while MF_MEM_PRE_REMOVE is set, because
  632. * the current may not be the one accessing the fsdax page.
  633. * Otherwise, search for the current task.
  634. */
  635. if (!pre_remove)
  636. t = task_early_kill(tsk, true);
  637. if (!t)
  638. continue;
  639. vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
  640. if (vma->vm_mm == t->mm)
  641. add_to_kill_fsdax(t, page, vma, to_kill, pgoff);
  642. }
  643. }
  644. rcu_read_unlock();
  645. i_mmap_unlock_read(mapping);
  646. }
  647. #endif /* CONFIG_FS_DAX */
  648. /*
  649. * Collect the processes who have the corrupted page mapped to kill.
  650. */
  651. static void collect_procs(struct folio *folio, struct page *page,
  652. struct list_head *tokill, int force_early)
  653. {
  654. if (!folio->mapping)
  655. return;
  656. if (unlikely(folio_test_ksm(folio)))
  657. collect_procs_ksm(folio, page, tokill, force_early);
  658. else if (folio_test_anon(folio))
  659. collect_procs_anon(folio, page, tokill, force_early);
  660. else
  661. collect_procs_file(folio, page, tokill, force_early);
  662. }
  663. struct hwpoison_walk {
  664. struct to_kill tk;
  665. unsigned long pfn;
  666. int flags;
  667. };
  668. static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift)
  669. {
  670. tk->addr = addr;
  671. tk->size_shift = shift;
  672. }
  673. static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
  674. unsigned long poisoned_pfn, struct to_kill *tk)
  675. {
  676. unsigned long pfn = 0;
  677. if (pte_present(pte)) {
  678. pfn = pte_pfn(pte);
  679. } else {
  680. swp_entry_t swp = pte_to_swp_entry(pte);
  681. if (is_hwpoison_entry(swp))
  682. pfn = swp_offset_pfn(swp);
  683. }
  684. if (!pfn || pfn != poisoned_pfn)
  685. return 0;
  686. set_to_kill(tk, addr, shift);
  687. return 1;
  688. }
  689. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  690. static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
  691. struct hwpoison_walk *hwp)
  692. {
  693. pmd_t pmd = *pmdp;
  694. unsigned long pfn;
  695. unsigned long hwpoison_vaddr;
  696. if (!pmd_present(pmd))
  697. return 0;
  698. pfn = pmd_pfn(pmd);
  699. if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) {
  700. hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT);
  701. set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT);
  702. return 1;
  703. }
  704. return 0;
  705. }
  706. #else
  707. static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
  708. struct hwpoison_walk *hwp)
  709. {
  710. return 0;
  711. }
  712. #endif
  713. static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
  714. unsigned long end, struct mm_walk *walk)
  715. {
  716. struct hwpoison_walk *hwp = walk->private;
  717. int ret = 0;
  718. pte_t *ptep, *mapped_pte;
  719. spinlock_t *ptl;
  720. ptl = pmd_trans_huge_lock(pmdp, walk->vma);
  721. if (ptl) {
  722. ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp);
  723. spin_unlock(ptl);
  724. goto out;
  725. }
  726. mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp,
  727. addr, &ptl);
  728. if (!ptep)
  729. goto out;
  730. for (; addr != end; ptep++, addr += PAGE_SIZE) {
  731. ret = check_hwpoisoned_entry(ptep_get(ptep), addr, PAGE_SHIFT,
  732. hwp->pfn, &hwp->tk);
  733. if (ret == 1)
  734. break;
  735. }
  736. pte_unmap_unlock(mapped_pte, ptl);
  737. out:
  738. cond_resched();
  739. return ret;
  740. }
  741. #ifdef CONFIG_HUGETLB_PAGE
  742. static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
  743. unsigned long addr, unsigned long end,
  744. struct mm_walk *walk)
  745. {
  746. struct hwpoison_walk *hwp = walk->private;
  747. pte_t pte = huge_ptep_get(walk->mm, addr, ptep);
  748. struct hstate *h = hstate_vma(walk->vma);
  749. return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
  750. hwp->pfn, &hwp->tk);
  751. }
  752. #else
  753. #define hwpoison_hugetlb_range NULL
  754. #endif
  755. static const struct mm_walk_ops hwpoison_walk_ops = {
  756. .pmd_entry = hwpoison_pte_range,
  757. .hugetlb_entry = hwpoison_hugetlb_range,
  758. .walk_lock = PGWALK_RDLOCK,
  759. };
  760. /*
  761. * Sends SIGBUS to the current process with error info.
  762. *
  763. * This function is intended to handle "Action Required" MCEs on already
  764. * hardware poisoned pages. They could happen, for example, when
  765. * memory_failure() failed to unmap the error page at the first call, or
  766. * when multiple local machine checks happened on different CPUs.
  767. *
  768. * MCE handler currently has no easy access to the error virtual address,
  769. * so this function walks page table to find it. The returned virtual address
  770. * is proper in most cases, but it could be wrong when the application
  771. * process has multiple entries mapping the error page.
  772. */
  773. static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
  774. int flags)
  775. {
  776. int ret;
  777. struct hwpoison_walk priv = {
  778. .pfn = pfn,
  779. };
  780. priv.tk.tsk = p;
  781. if (!p->mm)
  782. return -EFAULT;
  783. mmap_read_lock(p->mm);
  784. ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwpoison_walk_ops,
  785. (void *)&priv);
  786. if (ret == 1 && priv.tk.addr)
  787. kill_proc(&priv.tk, pfn, flags);
  788. else
  789. ret = 0;
  790. mmap_read_unlock(p->mm);
  791. return ret > 0 ? -EHWPOISON : -EFAULT;
  792. }
  793. /*
  794. * MF_IGNORED - The m-f() handler marks the page as PG_hwpoisoned'ed.
  795. * But it could not do more to isolate the page from being accessed again,
  796. * nor does it kill the process. This is extremely rare and one of the
  797. * potential causes is that the page state has been changed due to
  798. * underlying race condition. This is the most severe outcomes.
  799. *
  800. * MF_FAILED - The m-f() handler marks the page as PG_hwpoisoned'ed.
  801. * It should have killed the process, but it can't isolate the page,
  802. * due to conditions such as extra pin, unmap failure, etc. Accessing
  803. * the page again may trigger another MCE and the process will be killed
  804. * by the m-f() handler immediately.
  805. *
  806. * MF_DELAYED - The m-f() handler marks the page as PG_hwpoisoned'ed.
  807. * The page is unmapped, and is removed from the LRU or file mapping.
  808. * An attempt to access the page again will trigger page fault and the
  809. * PF handler will kill the process.
  810. *
  811. * MF_RECOVERED - The m-f() handler marks the page as PG_hwpoisoned'ed.
  812. * The page has been completely isolated, that is, unmapped, taken out of
  813. * the buddy system, or hole-punnched out of the file mapping.
  814. */
  815. static const char *action_name[] = {
  816. [MF_IGNORED] = "Ignored",
  817. [MF_FAILED] = "Failed",
  818. [MF_DELAYED] = "Delayed",
  819. [MF_RECOVERED] = "Recovered",
  820. };
  821. static const char * const action_page_types[] = {
  822. [MF_MSG_KERNEL] = "reserved kernel page",
  823. [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
  824. [MF_MSG_HUGE] = "huge page",
  825. [MF_MSG_FREE_HUGE] = "free huge page",
  826. [MF_MSG_GET_HWPOISON] = "get hwpoison page",
  827. [MF_MSG_UNMAP_FAILED] = "unmapping failed page",
  828. [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
  829. [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
  830. [MF_MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page",
  831. [MF_MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page",
  832. [MF_MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page",
  833. [MF_MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page",
  834. [MF_MSG_DIRTY_LRU] = "dirty LRU page",
  835. [MF_MSG_CLEAN_LRU] = "clean LRU page",
  836. [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page",
  837. [MF_MSG_BUDDY] = "free buddy page",
  838. [MF_MSG_DAX] = "dax page",
  839. [MF_MSG_UNSPLIT_THP] = "unsplit thp",
  840. [MF_MSG_ALREADY_POISONED] = "already poisoned",
  841. [MF_MSG_UNKNOWN] = "unknown page",
  842. };
  843. /*
  844. * XXX: It is possible that a page is isolated from LRU cache,
  845. * and then kept in swap cache or failed to remove from page cache.
  846. * The page count will stop it from being freed by unpoison.
  847. * Stress tests should be aware of this memory leak problem.
  848. */
  849. static int delete_from_lru_cache(struct folio *folio)
  850. {
  851. if (folio_isolate_lru(folio)) {
  852. /*
  853. * Clear sensible page flags, so that the buddy system won't
  854. * complain when the folio is unpoison-and-freed.
  855. */
  856. folio_clear_active(folio);
  857. folio_clear_unevictable(folio);
  858. /*
  859. * Poisoned page might never drop its ref count to 0 so we have
  860. * to uncharge it manually from its memcg.
  861. */
  862. mem_cgroup_uncharge(folio);
  863. /*
  864. * drop the refcount elevated by folio_isolate_lru()
  865. */
  866. folio_put(folio);
  867. return 0;
  868. }
  869. return -EIO;
  870. }
  871. static int truncate_error_folio(struct folio *folio, unsigned long pfn,
  872. struct address_space *mapping)
  873. {
  874. int ret = MF_FAILED;
  875. if (mapping->a_ops->error_remove_folio) {
  876. int err = mapping->a_ops->error_remove_folio(mapping, folio);
  877. if (err != 0)
  878. pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
  879. else if (!filemap_release_folio(folio, GFP_NOIO))
  880. pr_info("%#lx: failed to release buffers\n", pfn);
  881. else
  882. ret = MF_RECOVERED;
  883. } else {
  884. /*
  885. * If the file system doesn't support it just invalidate
  886. * This fails on dirty or anything with private pages
  887. */
  888. if (mapping_evict_folio(mapping, folio))
  889. ret = MF_RECOVERED;
  890. else
  891. pr_info("%#lx: Failed to invalidate\n", pfn);
  892. }
  893. return ret;
  894. }
  895. struct page_state {
  896. unsigned long mask;
  897. unsigned long res;
  898. enum mf_action_page_type type;
  899. /* Callback ->action() has to unlock the relevant page inside it. */
  900. int (*action)(struct page_state *ps, struct page *p);
  901. };
  902. /*
  903. * Return true if page is still referenced by others, otherwise return
  904. * false.
  905. *
  906. * The extra_pins is true when one extra refcount is expected.
  907. */
  908. static bool has_extra_refcount(struct page_state *ps, struct page *p,
  909. bool extra_pins)
  910. {
  911. int count = page_count(p) - 1;
  912. if (extra_pins)
  913. count -= folio_nr_pages(page_folio(p));
  914. if (count > 0) {
  915. pr_err("%#lx: %s still referenced by %d users\n",
  916. page_to_pfn(p), action_page_types[ps->type], count);
  917. return true;
  918. }
  919. return false;
  920. }
  921. /*
  922. * Error hit kernel page.
  923. * Do nothing, try to be lucky and not touch this instead. For a few cases we
  924. * could be more sophisticated.
  925. */
  926. static int me_kernel(struct page_state *ps, struct page *p)
  927. {
  928. unlock_page(p);
  929. return MF_IGNORED;
  930. }
  931. /*
  932. * Page in unknown state. Do nothing.
  933. * This is a catch-all in case we fail to make sense of the page state.
  934. */
  935. static int me_unknown(struct page_state *ps, struct page *p)
  936. {
  937. pr_err("%#lx: Unknown page state\n", page_to_pfn(p));
  938. unlock_page(p);
  939. return MF_IGNORED;
  940. }
  941. /*
  942. * Clean (or cleaned) page cache page.
  943. */
  944. static int me_pagecache_clean(struct page_state *ps, struct page *p)
  945. {
  946. struct folio *folio = page_folio(p);
  947. int ret;
  948. struct address_space *mapping;
  949. bool extra_pins;
  950. delete_from_lru_cache(folio);
  951. /*
  952. * For anonymous folios the only reference left
  953. * should be the one m_f() holds.
  954. */
  955. if (folio_test_anon(folio)) {
  956. ret = MF_RECOVERED;
  957. goto out;
  958. }
  959. /*
  960. * Now truncate the page in the page cache. This is really
  961. * more like a "temporary hole punch"
  962. * Don't do this for block devices when someone else
  963. * has a reference, because it could be file system metadata
  964. * and that's not safe to truncate.
  965. */
  966. mapping = folio_mapping(folio);
  967. if (!mapping) {
  968. /* Folio has been torn down in the meantime */
  969. ret = MF_FAILED;
  970. goto out;
  971. }
  972. /*
  973. * The shmem page is kept in page cache instead of truncating
  974. * so is expected to have an extra refcount after error-handling.
  975. */
  976. extra_pins = shmem_mapping(mapping);
  977. /*
  978. * Truncation is a bit tricky. Enable it per file system for now.
  979. *
  980. * Open: to take i_rwsem or not for this? Right now we don't.
  981. */
  982. ret = truncate_error_folio(folio, page_to_pfn(p), mapping);
  983. if (has_extra_refcount(ps, p, extra_pins))
  984. ret = MF_FAILED;
  985. out:
  986. folio_unlock(folio);
  987. return ret;
  988. }
  989. /*
  990. * Dirty pagecache page
  991. * Issues: when the error hit a hole page the error is not properly
  992. * propagated.
  993. */
  994. static int me_pagecache_dirty(struct page_state *ps, struct page *p)
  995. {
  996. struct folio *folio = page_folio(p);
  997. struct address_space *mapping = folio_mapping(folio);
  998. /* TBD: print more information about the file. */
  999. if (mapping) {
  1000. /*
  1001. * IO error will be reported by write(), fsync(), etc.
  1002. * who check the mapping.
  1003. * This way the application knows that something went
  1004. * wrong with its dirty file data.
  1005. */
  1006. mapping_set_error(mapping, -EIO);
  1007. }
  1008. return me_pagecache_clean(ps, p);
  1009. }
  1010. /*
  1011. * Clean and dirty swap cache.
  1012. *
  1013. * Dirty swap cache page is tricky to handle. The page could live both in page
  1014. * table and swap cache(ie. page is freshly swapped in). So it could be
  1015. * referenced concurrently by 2 types of PTEs:
  1016. * normal PTEs and swap PTEs. We try to handle them consistently by calling
  1017. * try_to_unmap(!TTU_HWPOISON) to convert the normal PTEs to swap PTEs,
  1018. * and then
  1019. * - clear dirty bit to prevent IO
  1020. * - remove from LRU
  1021. * - but keep in the swap cache, so that when we return to it on
  1022. * a later page fault, we know the application is accessing
  1023. * corrupted data and shall be killed (we installed simple
  1024. * interception code in do_swap_page to catch it).
  1025. *
  1026. * Clean swap cache pages can be directly isolated. A later page fault will
  1027. * bring in the known good data from disk.
  1028. */
  1029. static int me_swapcache_dirty(struct page_state *ps, struct page *p)
  1030. {
  1031. struct folio *folio = page_folio(p);
  1032. int ret;
  1033. bool extra_pins = false;
  1034. folio_clear_dirty(folio);
  1035. /* Trigger EIO in shmem: */
  1036. folio_clear_uptodate(folio);
  1037. ret = delete_from_lru_cache(folio) ? MF_FAILED : MF_DELAYED;
  1038. folio_unlock(folio);
  1039. if (ret == MF_DELAYED)
  1040. extra_pins = true;
  1041. if (has_extra_refcount(ps, p, extra_pins))
  1042. ret = MF_FAILED;
  1043. return ret;
  1044. }
  1045. static int me_swapcache_clean(struct page_state *ps, struct page *p)
  1046. {
  1047. struct folio *folio = page_folio(p);
  1048. int ret;
  1049. delete_from_swap_cache(folio);
  1050. ret = delete_from_lru_cache(folio) ? MF_FAILED : MF_RECOVERED;
  1051. folio_unlock(folio);
  1052. if (has_extra_refcount(ps, p, false))
  1053. ret = MF_FAILED;
  1054. return ret;
  1055. }
  1056. /*
  1057. * Huge pages. Needs work.
  1058. * Issues:
  1059. * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
  1060. * To narrow down kill region to one page, we need to break up pmd.
  1061. */
  1062. static int me_huge_page(struct page_state *ps, struct page *p)
  1063. {
  1064. struct folio *folio = page_folio(p);
  1065. int res;
  1066. struct address_space *mapping;
  1067. bool extra_pins = false;
  1068. mapping = folio_mapping(folio);
  1069. if (mapping) {
  1070. res = truncate_error_folio(folio, page_to_pfn(p), mapping);
  1071. /* The page is kept in page cache. */
  1072. extra_pins = true;
  1073. folio_unlock(folio);
  1074. } else {
  1075. folio_unlock(folio);
  1076. /*
  1077. * migration entry prevents later access on error hugepage,
  1078. * so we can free and dissolve it into buddy to save healthy
  1079. * subpages.
  1080. */
  1081. folio_put(folio);
  1082. if (__page_handle_poison(p) > 0) {
  1083. page_ref_inc(p);
  1084. res = MF_RECOVERED;
  1085. } else {
  1086. res = MF_FAILED;
  1087. }
  1088. }
  1089. if (has_extra_refcount(ps, p, extra_pins))
  1090. res = MF_FAILED;
  1091. return res;
  1092. }
  1093. /*
  1094. * Various page states we can handle.
  1095. *
  1096. * A page state is defined by its current page->flags bits.
  1097. * The table matches them in order and calls the right handler.
  1098. *
  1099. * This is quite tricky because we can access page at any time
  1100. * in its live cycle, so all accesses have to be extremely careful.
  1101. *
  1102. * This is not complete. More states could be added.
  1103. * For any missing state don't attempt recovery.
  1104. */
  1105. #define dirty (1UL << PG_dirty)
  1106. #define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked))
  1107. #define unevict (1UL << PG_unevictable)
  1108. #define mlock (1UL << PG_mlocked)
  1109. #define lru (1UL << PG_lru)
  1110. #define head (1UL << PG_head)
  1111. #define reserved (1UL << PG_reserved)
  1112. static struct page_state error_states[] = {
  1113. { reserved, reserved, MF_MSG_KERNEL, me_kernel },
  1114. /*
  1115. * free pages are specially detected outside this table:
  1116. * PG_buddy pages only make a small fraction of all free pages.
  1117. */
  1118. { head, head, MF_MSG_HUGE, me_huge_page },
  1119. { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
  1120. { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
  1121. { mlock|dirty, mlock|dirty, MF_MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty },
  1122. { mlock|dirty, mlock, MF_MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean },
  1123. { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty },
  1124. { unevict|dirty, unevict, MF_MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean },
  1125. { lru|dirty, lru|dirty, MF_MSG_DIRTY_LRU, me_pagecache_dirty },
  1126. { lru|dirty, lru, MF_MSG_CLEAN_LRU, me_pagecache_clean },
  1127. /*
  1128. * Catchall entry: must be at end.
  1129. */
  1130. { 0, 0, MF_MSG_UNKNOWN, me_unknown },
  1131. };
  1132. #undef dirty
  1133. #undef sc
  1134. #undef unevict
  1135. #undef mlock
  1136. #undef lru
  1137. #undef head
  1138. #undef reserved
  1139. static void update_per_node_mf_stats(unsigned long pfn,
  1140. enum mf_result result)
  1141. {
  1142. int nid = MAX_NUMNODES;
  1143. struct memory_failure_stats *mf_stats = NULL;
  1144. nid = pfn_to_nid(pfn);
  1145. if (unlikely(nid < 0 || nid >= MAX_NUMNODES)) {
  1146. WARN_ONCE(1, "Memory failure: pfn=%#lx, invalid nid=%d", pfn, nid);
  1147. return;
  1148. }
  1149. mf_stats = &NODE_DATA(nid)->mf_stats;
  1150. switch (result) {
  1151. case MF_IGNORED:
  1152. ++mf_stats->ignored;
  1153. break;
  1154. case MF_FAILED:
  1155. ++mf_stats->failed;
  1156. break;
  1157. case MF_DELAYED:
  1158. ++mf_stats->delayed;
  1159. break;
  1160. case MF_RECOVERED:
  1161. ++mf_stats->recovered;
  1162. break;
  1163. default:
  1164. WARN_ONCE(1, "Memory failure: mf_result=%d is not properly handled", result);
  1165. break;
  1166. }
  1167. ++mf_stats->total;
  1168. }
  1169. /*
  1170. * "Dirty/Clean" indication is not 100% accurate due to the possibility of
  1171. * setting PG_dirty outside page lock. See also comment above set_page_dirty().
  1172. */
  1173. static int action_result(unsigned long pfn, enum mf_action_page_type type,
  1174. enum mf_result result)
  1175. {
  1176. trace_memory_failure_event(pfn, type, result);
  1177. num_poisoned_pages_inc(pfn);
  1178. update_per_node_mf_stats(pfn, result);
  1179. pr_err("%#lx: recovery action for %s: %s\n",
  1180. pfn, action_page_types[type], action_name[result]);
  1181. return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
  1182. }
  1183. static int page_action(struct page_state *ps, struct page *p,
  1184. unsigned long pfn)
  1185. {
  1186. int result;
  1187. /* page p should be unlocked after returning from ps->action(). */
  1188. result = ps->action(ps, p);
  1189. /* Could do more checks here if page looks ok */
  1190. /*
  1191. * Could adjust zone counters here to correct for the missing page.
  1192. */
  1193. return action_result(pfn, ps->type, result);
  1194. }
  1195. static inline bool PageHWPoisonTakenOff(struct page *page)
  1196. {
  1197. return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON;
  1198. }
  1199. void SetPageHWPoisonTakenOff(struct page *page)
  1200. {
  1201. set_page_private(page, MAGIC_HWPOISON);
  1202. }
  1203. void ClearPageHWPoisonTakenOff(struct page *page)
  1204. {
  1205. if (PageHWPoison(page))
  1206. set_page_private(page, 0);
  1207. }
  1208. /*
  1209. * Return true if a page type of a given page is supported by hwpoison
  1210. * mechanism (while handling could fail), otherwise false. This function
  1211. * does not return true for hugetlb or device memory pages, so it's assumed
  1212. * to be called only in the context where we never have such pages.
  1213. */
  1214. static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
  1215. {
  1216. if (PageSlab(page))
  1217. return false;
  1218. /* Soft offline could migrate non-LRU movable pages */
  1219. if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page))
  1220. return true;
  1221. return PageLRU(page) || is_free_buddy_page(page);
  1222. }
  1223. static int __get_hwpoison_page(struct page *page, unsigned long flags)
  1224. {
  1225. struct folio *folio = page_folio(page);
  1226. int ret = 0;
  1227. bool hugetlb = false;
  1228. ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, false);
  1229. if (hugetlb) {
  1230. /* Make sure hugetlb demotion did not happen from under us. */
  1231. if (folio == page_folio(page))
  1232. return ret;
  1233. if (ret > 0) {
  1234. folio_put(folio);
  1235. folio = page_folio(page);
  1236. }
  1237. }
  1238. /*
  1239. * This check prevents from calling folio_try_get() for any
  1240. * unsupported type of folio in order to reduce the risk of unexpected
  1241. * races caused by taking a folio refcount.
  1242. */
  1243. if (!HWPoisonHandlable(&folio->page, flags))
  1244. return -EBUSY;
  1245. if (folio_try_get(folio)) {
  1246. if (folio == page_folio(page))
  1247. return 1;
  1248. pr_info("%#lx cannot catch tail\n", page_to_pfn(page));
  1249. folio_put(folio);
  1250. }
  1251. return 0;
  1252. }
  1253. #define GET_PAGE_MAX_RETRY_NUM 3
  1254. static int get_any_page(struct page *p, unsigned long flags)
  1255. {
  1256. int ret = 0, pass = 0;
  1257. bool count_increased = false;
  1258. if (flags & MF_COUNT_INCREASED)
  1259. count_increased = true;
  1260. try_again:
  1261. if (!count_increased) {
  1262. ret = __get_hwpoison_page(p, flags);
  1263. if (!ret) {
  1264. if (page_count(p)) {
  1265. /* We raced with an allocation, retry. */
  1266. if (pass++ < GET_PAGE_MAX_RETRY_NUM)
  1267. goto try_again;
  1268. ret = -EBUSY;
  1269. } else if (!PageHuge(p) && !is_free_buddy_page(p)) {
  1270. /* We raced with put_page, retry. */
  1271. if (pass++ < GET_PAGE_MAX_RETRY_NUM)
  1272. goto try_again;
  1273. ret = -EIO;
  1274. }
  1275. goto out;
  1276. } else if (ret == -EBUSY) {
  1277. /*
  1278. * We raced with (possibly temporary) unhandlable
  1279. * page, retry.
  1280. */
1281. if (pass++ < GET_PAGE_MAX_RETRY_NUM) {
  1282. shake_page(p);
  1283. goto try_again;
  1284. }
  1285. ret = -EIO;
  1286. goto out;
  1287. }
  1288. }
  1289. if (PageHuge(p) || HWPoisonHandlable(p, flags)) {
  1290. ret = 1;
  1291. } else {
  1292. /*
  1293. * A page we cannot handle. Check whether we can turn
  1294. * it into something we can handle.
  1295. */
  1296. if (pass++ < GET_PAGE_MAX_RETRY_NUM) {
  1297. put_page(p);
  1298. shake_page(p);
  1299. count_increased = false;
  1300. goto try_again;
  1301. }
  1302. put_page(p);
  1303. ret = -EIO;
  1304. }
  1305. out:
  1306. if (ret == -EIO)
  1307. pr_err("%#lx: unhandlable page.\n", page_to_pfn(p));
  1308. return ret;
  1309. }
  1310. static int __get_unpoison_page(struct page *page)
  1311. {
  1312. struct folio *folio = page_folio(page);
  1313. int ret = 0;
  1314. bool hugetlb = false;
  1315. ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, true);
  1316. if (hugetlb) {
  1317. /* Make sure hugetlb demotion did not happen from under us. */
  1318. if (folio == page_folio(page))
  1319. return ret;
  1320. if (ret > 0)
  1321. folio_put(folio);
  1322. }
  1323. /*
1324. * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison,
1325. * but are also isolated from the buddy freelist, so we need to identify
1326. * that state and cancel both operations in order to unpoison.
  1327. */
  1328. if (PageHWPoisonTakenOff(page))
  1329. return -EHWPOISON;
  1330. return get_page_unless_zero(page) ? 1 : 0;
  1331. }
  1332. /**
  1333. * get_hwpoison_page() - Get refcount for memory error handling
  1334. * @p: Raw error page (hit by memory error)
  1335. * @flags: Flags controlling behavior of error handling
  1336. *
  1337. * get_hwpoison_page() takes a page refcount of an error page to handle memory
  1338. * error on it, after checking that the error page is in a well-defined state
1339. * (defined as a page type on which we can successfully handle the memory
1340. * error, such as LRU pages and hugetlb pages).
  1341. *
  1342. * Memory error handling could be triggered at any time on any type of page,
  1343. * so it's prone to race with typical memory management lifecycle (like
  1344. * allocation and free). So to avoid such races, get_hwpoison_page() takes
  1345. * extra care for the error page's state (as done in __get_hwpoison_page()),
  1346. * and has some retry logic in get_any_page().
  1347. *
  1348. * When called from unpoison_memory(), the caller should already ensure that
  1349. * the given page has PG_hwpoison. So it's never reused for other page
  1350. * allocations, and __get_unpoison_page() never races with them.
  1351. *
  1352. * Return: 0 on failure or free buddy (hugetlb) page,
  1353. * 1 on success for in-use pages in a well-defined state,
  1354. * -EIO for pages on which we can not handle memory errors,
  1355. * -EBUSY when get_hwpoison_page() has raced with page lifecycle
  1356. * operations like allocation and free,
  1357. * -EHWPOISON when the page is hwpoisoned and taken off from buddy.
  1358. */
  1359. static int get_hwpoison_page(struct page *p, unsigned long flags)
  1360. {
  1361. int ret;
  1362. zone_pcp_disable(page_zone(p));
  1363. if (flags & MF_UNPOISON)
  1364. ret = __get_unpoison_page(p);
  1365. else
  1366. ret = get_any_page(p, flags);
  1367. zone_pcp_enable(page_zone(p));
  1368. return ret;
  1369. }
  1370. void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
  1371. {
  1372. if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
  1373. struct address_space *mapping;
  1374. /*
  1375. * For hugetlb folios in shared mappings, try_to_unmap
  1376. * could potentially call huge_pmd_unshare. Because of
  1377. * this, take semaphore in write mode here and set
  1378. * TTU_RMAP_LOCKED to indicate we have taken the lock
  1379. * at this higher level.
  1380. */
  1381. mapping = hugetlb_folio_mapping_lock_write(folio);
  1382. if (!mapping) {
  1383. pr_info("%#lx: could not lock mapping for mapped hugetlb folio\n",
  1384. folio_pfn(folio));
  1385. return;
  1386. }
  1387. try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
  1388. i_mmap_unlock_write(mapping);
  1389. } else {
  1390. try_to_unmap(folio, ttu);
  1391. }
  1392. }
  1393. /*
  1394. * Do all that is necessary to remove user space mappings. Unmap
  1395. * the pages and send SIGBUS to the processes if the data was dirty.
  1396. */
  1397. static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
  1398. unsigned long pfn, int flags)
  1399. {
  1400. enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
  1401. struct address_space *mapping;
  1402. LIST_HEAD(tokill);
  1403. bool unmap_success;
  1404. int forcekill;
  1405. bool mlocked = folio_test_mlocked(folio);
  1406. /*
  1407. * Here we are interested only in user-mapped pages, so skip any
  1408. * other types of pages.
  1409. */
  1410. if (folio_test_reserved(folio) || folio_test_slab(folio) ||
  1411. folio_test_pgtable(folio) || folio_test_offline(folio))
  1412. return true;
  1413. if (!(folio_test_lru(folio) || folio_test_hugetlb(folio)))
  1414. return true;
  1415. /*
1416. * This check implies we don't kill processes early if their pages
1417. * are in the swap cache. Those are always late kills.
  1418. */
  1419. if (!folio_mapped(folio))
  1420. return true;
  1421. if (folio_test_swapcache(folio)) {
  1422. pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
  1423. ttu &= ~TTU_HWPOISON;
  1424. }
  1425. /*
  1426. * Propagate the dirty bit from PTEs to struct page first, because we
  1427. * need this to decide if we should kill or just drop the page.
  1428. * XXX: the dirty test could be racy: set_page_dirty() may not always
  1429. * be called inside page lock (it's recommended but not enforced).
  1430. */
  1431. mapping = folio_mapping(folio);
  1432. if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping &&
  1433. mapping_can_writeback(mapping)) {
  1434. if (folio_mkclean(folio)) {
  1435. folio_set_dirty(folio);
  1436. } else {
  1437. ttu &= ~TTU_HWPOISON;
  1438. pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
  1439. pfn);
  1440. }
  1441. }
  1442. /*
  1443. * First collect all the processes that have the page
  1444. * mapped in dirty form. This has to be done before try_to_unmap,
  1445. * because ttu takes the rmap data structures down.
  1446. */
  1447. collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
  1448. unmap_poisoned_folio(folio, ttu);
  1449. unmap_success = !folio_mapped(folio);
  1450. if (!unmap_success)
  1451. pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n",
  1452. pfn, folio_mapcount(folio));
  1453. /*
1454. * try_to_unmap() might put an mlocked page into the LRU cache, so call
1455. * shake_folio() again to ensure that it's flushed.
  1456. */
  1457. if (mlocked)
  1458. shake_folio(folio);
  1459. /*
  1460. * Now that the dirty bit has been propagated to the
  1461. * struct page and all unmaps done we can decide if
  1462. * killing is needed or not. Only kill when the page
  1463. * was dirty or the process is not restartable,
1464. * otherwise the tokill list is merely
1465. * freed. When there was a problem unmapping earlier,
1466. * use a more forceful, uncatchable kill to prevent
1467. * any accesses to the poisoned memory.
  1468. */
  1469. forcekill = folio_test_dirty(folio) || (flags & MF_MUST_KILL) ||
  1470. !unmap_success;
  1471. kill_procs(&tokill, forcekill, pfn, flags);
  1472. return unmap_success;
  1473. }
  1474. static int identify_page_state(unsigned long pfn, struct page *p,
  1475. unsigned long page_flags)
  1476. {
  1477. struct page_state *ps;
  1478. /*
  1479. * The first check uses the current page flags which may not have any
  1480. * relevant information. The second check with the saved page flags is
  1481. * carried out only if the first check can't determine the page status.
  1482. */
  1483. for (ps = error_states;; ps++)
  1484. if ((p->flags & ps->mask) == ps->res)
  1485. break;
  1486. page_flags |= (p->flags & (1UL << PG_dirty));
  1487. if (!ps->mask)
  1488. for (ps = error_states;; ps++)
  1489. if ((page_flags & ps->mask) == ps->res)
  1490. break;
  1491. return page_action(ps, p, pfn);
  1492. }
  1493. /*
  1494. * When 'release' is 'false', it means that if thp split has failed,
  1495. * there is still more to do, hence the page refcount we took earlier
  1496. * is still needed.
  1497. */
  1498. static int try_to_split_thp_page(struct page *page, bool release)
  1499. {
  1500. int ret;
  1501. lock_page(page);
  1502. ret = split_huge_page(page);
  1503. unlock_page(page);
  1504. if (ret && release)
  1505. put_page(page);
  1506. return ret;
  1507. }
  1508. static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
  1509. struct address_space *mapping, pgoff_t index, int flags)
  1510. {
  1511. struct to_kill *tk;
  1512. unsigned long size = 0;
  1513. list_for_each_entry(tk, to_kill, nd)
  1514. if (tk->size_shift)
  1515. size = max(size, 1UL << tk->size_shift);
  1516. if (size) {
  1517. /*
  1518. * Unmap the largest mapping to avoid breaking up device-dax
  1519. * mappings which are constant size. The actual size of the
  1520. * mapping being torn down is communicated in siginfo, see
  1521. * kill_proc()
  1522. */
  1523. loff_t start = ((loff_t)index << PAGE_SHIFT) & ~(size - 1);
  1524. unmap_mapping_range(mapping, start, size, 0);
  1525. }
  1526. kill_procs(to_kill, flags & MF_MUST_KILL, pfn, flags);
  1527. }
  1528. /*
1529. * Only dev_pagemap pages get here: fsdax pages, when the filesystem
1530. * either does not claim or fails to claim a hwpoison event, or devdax pages.
1531. * The fsdax pages are initialized per base page, while the devdax pages
1532. * can be initialized either as base pages or as compound pages with
1533. * vmemmap optimization enabled. Devdax handles hwpoison simply: if a
1534. * subpage of a compound page is poisoned, marking the compound head
1535. * page is sufficient.
  1536. */
  1537. static int mf_generic_kill_procs(unsigned long long pfn, int flags,
  1538. struct dev_pagemap *pgmap)
  1539. {
  1540. struct folio *folio = pfn_folio(pfn);
  1541. LIST_HEAD(to_kill);
  1542. dax_entry_t cookie;
  1543. int rc = 0;
  1544. /*
  1545. * Prevent the inode from being freed while we are interrogating
  1546. * the address_space, typically this would be handled by
  1547. * lock_page(), but dax pages do not use the page lock. This
  1548. * also prevents changes to the mapping of this pfn until
  1549. * poison signaling is complete.
  1550. */
  1551. cookie = dax_lock_folio(folio);
  1552. if (!cookie)
  1553. return -EBUSY;
  1554. if (hwpoison_filter(&folio->page)) {
  1555. rc = -EOPNOTSUPP;
  1556. goto unlock;
  1557. }
  1558. switch (pgmap->type) {
  1559. case MEMORY_DEVICE_PRIVATE:
  1560. case MEMORY_DEVICE_COHERENT:
  1561. /*
  1562. * TODO: Handle device pages which may need coordination
  1563. * with device-side memory.
  1564. */
  1565. rc = -ENXIO;
  1566. goto unlock;
  1567. default:
  1568. break;
  1569. }
  1570. /*
  1571. * Use this flag as an indication that the dax page has been
  1572. * remapped UC to prevent speculative consumption of poison.
  1573. */
  1574. SetPageHWPoison(&folio->page);
  1575. /*
  1576. * Unlike System-RAM there is no possibility to swap in a
  1577. * different physical page at a given virtual address, so all
  1578. * userspace consumption of ZONE_DEVICE memory necessitates
  1579. * SIGBUS (i.e. MF_MUST_KILL)
  1580. */
  1581. flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
  1582. collect_procs(folio, &folio->page, &to_kill, true);
  1583. unmap_and_kill(&to_kill, pfn, folio->mapping, folio->index, flags);
  1584. unlock:
  1585. dax_unlock_folio(folio, cookie);
  1586. return rc;
  1587. }
  1588. #ifdef CONFIG_FS_DAX
  1589. /**
  1590. * mf_dax_kill_procs - Collect and kill processes who are using this file range
  1591. * @mapping: address_space of the file in use
  1592. * @index: start pgoff of the range within the file
  1593. * @count: length of the range, in unit of PAGE_SIZE
  1594. * @mf_flags: memory failure flags
  1595. */
  1596. int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
  1597. unsigned long count, int mf_flags)
  1598. {
  1599. LIST_HEAD(to_kill);
  1600. dax_entry_t cookie;
  1601. struct page *page;
  1602. size_t end = index + count;
  1603. bool pre_remove = mf_flags & MF_MEM_PRE_REMOVE;
  1604. mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
  1605. for (; index < end; index++) {
  1606. page = NULL;
  1607. cookie = dax_lock_mapping_entry(mapping, index, &page);
  1608. if (!cookie)
  1609. return -EBUSY;
  1610. if (!page)
  1611. goto unlock;
  1612. if (!pre_remove)
  1613. SetPageHWPoison(page);
  1614. /*
1615. * The pre_remove case is revoking access; the memory is still
1616. * good and could theoretically be put back into service.
  1617. */
  1618. collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove);
  1619. unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
  1620. index, mf_flags);
  1621. unlock:
  1622. dax_unlock_mapping_entry(mapping, index, cookie);
  1623. }
  1624. return 0;
  1625. }
  1626. EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
  1627. #endif /* CONFIG_FS_DAX */
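/*
 * Example: a minimal sketch of how a DAX filesystem that has already
 * resolved a failed device range to a file's address_space and page offset
 * might hand the range to mf_dax_kill_procs(). The helper name and the way
 * the range is resolved are hypothetical; only mf_dax_kill_procs() and its
 * parameters come from the kernel-doc above.
 *
 *	static int example_notify_file_range_failure(struct address_space *mapping,
 *						     loff_t pos, u64 len, int mf_flags)
 *	{
 *		pgoff_t index = pos >> PAGE_SHIFT;
 *		unsigned long count = DIV_ROUND_UP(len, PAGE_SIZE);
 *
 *		// Collect and kill/notify processes using the poisoned range.
 *		return mf_dax_kill_procs(mapping, index, count, mf_flags);
 *	}
 */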
  1628. #ifdef CONFIG_HUGETLB_PAGE
  1629. /*
1630. * Struct raw_hwp_page represents information about a "raw error page",
1631. * forming a singly linked list off the folio's ->_hugetlb_hwpoison field.
  1632. */
  1633. struct raw_hwp_page {
  1634. struct llist_node node;
  1635. struct page *page;
  1636. };
  1637. static inline struct llist_head *raw_hwp_list_head(struct folio *folio)
  1638. {
  1639. return (struct llist_head *)&folio->_hugetlb_hwpoison;
  1640. }
  1641. bool is_raw_hwpoison_page_in_hugepage(struct page *page)
  1642. {
  1643. struct llist_head *raw_hwp_head;
  1644. struct raw_hwp_page *p;
  1645. struct folio *folio = page_folio(page);
  1646. bool ret = false;
  1647. if (!folio_test_hwpoison(folio))
  1648. return false;
  1649. if (!folio_test_hugetlb(folio))
  1650. return PageHWPoison(page);
  1651. /*
1652. * When RawHwpUnreliable is set, the kernel has lost track of which
1653. * subpages are HWPOISON, so return as if ALL subpages are HWPOISONed.
  1654. */
  1655. if (folio_test_hugetlb_raw_hwp_unreliable(folio))
  1656. return true;
  1657. mutex_lock(&mf_mutex);
  1658. raw_hwp_head = raw_hwp_list_head(folio);
  1659. llist_for_each_entry(p, raw_hwp_head->first, node) {
  1660. if (page == p->page) {
  1661. ret = true;
  1662. break;
  1663. }
  1664. }
  1665. mutex_unlock(&mf_mutex);
  1666. return ret;
  1667. }
  1668. static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
  1669. {
  1670. struct llist_node *head;
  1671. struct raw_hwp_page *p, *next;
  1672. unsigned long count = 0;
  1673. head = llist_del_all(raw_hwp_list_head(folio));
  1674. llist_for_each_entry_safe(p, next, head, node) {
  1675. if (move_flag)
  1676. SetPageHWPoison(p->page);
  1677. else
  1678. num_poisoned_pages_sub(page_to_pfn(p->page), 1);
  1679. kfree(p);
  1680. count++;
  1681. }
  1682. return count;
  1683. }
  1684. static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
  1685. {
  1686. struct llist_head *head;
  1687. struct raw_hwp_page *raw_hwp;
  1688. struct raw_hwp_page *p;
  1689. int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0;
  1690. /*
1691. * Once the hwpoisoned hugepage has lost reliable raw error info,
1692. * there is little point in keeping additional error info precisely,
1693. * so skip adding further raw error info.
  1694. */
  1695. if (folio_test_hugetlb_raw_hwp_unreliable(folio))
  1696. return -EHWPOISON;
  1697. head = raw_hwp_list_head(folio);
  1698. llist_for_each_entry(p, head->first, node) {
  1699. if (p->page == page)
  1700. return -EHWPOISON;
  1701. }
  1702. raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
  1703. if (raw_hwp) {
  1704. raw_hwp->page = page;
  1705. llist_add(&raw_hwp->node, head);
  1706. /* the first error event will be counted in action_result(). */
  1707. if (ret)
  1708. num_poisoned_pages_inc(page_to_pfn(page));
  1709. } else {
  1710. /*
1711. * Failed to save raw error info. We no longer track all
1712. * hwpoisoned subpages, and we must refuse to free/dissolve
1713. * this hwpoisoned hugepage.
  1714. */
  1715. folio_set_hugetlb_raw_hwp_unreliable(folio);
  1716. /*
  1717. * Once hugetlb_raw_hwp_unreliable is set, raw_hwp_page is not
  1718. * used any more, so free it.
  1719. */
  1720. __folio_free_raw_hwp(folio, false);
  1721. }
  1722. return ret;
  1723. }
  1724. static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag)
  1725. {
  1726. /*
  1727. * hugetlb_vmemmap_optimized hugepages can't be freed because struct
  1728. * pages for tail pages are required but they don't exist.
  1729. */
  1730. if (move_flag && folio_test_hugetlb_vmemmap_optimized(folio))
  1731. return 0;
  1732. /*
  1733. * hugetlb_raw_hwp_unreliable hugepages shouldn't be unpoisoned by
  1734. * definition.
  1735. */
  1736. if (folio_test_hugetlb_raw_hwp_unreliable(folio))
  1737. return 0;
  1738. return __folio_free_raw_hwp(folio, move_flag);
  1739. }
  1740. void folio_clear_hugetlb_hwpoison(struct folio *folio)
  1741. {
  1742. if (folio_test_hugetlb_raw_hwp_unreliable(folio))
  1743. return;
  1744. if (folio_test_hugetlb_vmemmap_optimized(folio))
  1745. return;
  1746. folio_clear_hwpoison(folio);
  1747. folio_free_raw_hwp(folio, true);
  1748. }
  1749. /*
  1750. * Called from hugetlb code with hugetlb_lock held.
  1751. *
  1752. * Return values:
  1753. * 0 - free hugepage
  1754. * 1 - in-use hugepage
  1755. * 2 - not a hugepage
  1756. * -EBUSY - the hugepage is busy (try to retry)
  1757. * -EHWPOISON - the hugepage is already hwpoisoned
  1758. */
  1759. int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
  1760. bool *migratable_cleared)
  1761. {
  1762. struct page *page = pfn_to_page(pfn);
  1763. struct folio *folio = page_folio(page);
  1764. int ret = 2; /* fallback to normal page handling */
  1765. bool count_increased = false;
  1766. if (!folio_test_hugetlb(folio))
  1767. goto out;
  1768. if (flags & MF_COUNT_INCREASED) {
  1769. ret = 1;
  1770. count_increased = true;
  1771. } else if (folio_test_hugetlb_freed(folio)) {
  1772. ret = 0;
  1773. } else if (folio_test_hugetlb_migratable(folio)) {
  1774. ret = folio_try_get(folio);
  1775. if (ret)
  1776. count_increased = true;
  1777. } else {
  1778. ret = -EBUSY;
  1779. if (!(flags & MF_NO_RETRY))
  1780. goto out;
  1781. }
  1782. if (folio_set_hugetlb_hwpoison(folio, page)) {
  1783. ret = -EHWPOISON;
  1784. goto out;
  1785. }
  1786. /*
1787. * Clear hugetlb_migratable for hwpoisoned hugepages to prevent them
1788. * from being migrated by memory hot-remove.
  1789. */
  1790. if (count_increased && folio_test_hugetlb_migratable(folio)) {
  1791. folio_clear_hugetlb_migratable(folio);
  1792. *migratable_cleared = true;
  1793. }
  1794. return ret;
  1795. out:
  1796. if (count_increased)
  1797. folio_put(folio);
  1798. return ret;
  1799. }
  1800. /*
1801. * Taking a refcount on hugetlb pages needs extra care about race conditions
1802. * with basic operations like hugepage allocation/free/demotion.
1803. * So some of the prechecks for hwpoison (pinning, and testing/setting
1804. * PageHWPoison) should be done within a single hugetlb_lock section.
  1805. */
  1806. static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
  1807. {
  1808. int res;
  1809. struct page *p = pfn_to_page(pfn);
  1810. struct folio *folio;
  1811. unsigned long page_flags;
  1812. bool migratable_cleared = false;
  1813. *hugetlb = 1;
  1814. retry:
  1815. res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
  1816. if (res == 2) { /* fallback to normal page handling */
  1817. *hugetlb = 0;
  1818. return 0;
  1819. } else if (res == -EHWPOISON) {
  1820. pr_err("%#lx: already hardware poisoned\n", pfn);
  1821. if (flags & MF_ACTION_REQUIRED) {
  1822. folio = page_folio(p);
  1823. res = kill_accessing_process(current, folio_pfn(folio), flags);
  1824. action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
  1825. }
  1826. return res;
  1827. } else if (res == -EBUSY) {
  1828. if (!(flags & MF_NO_RETRY)) {
  1829. flags |= MF_NO_RETRY;
  1830. goto retry;
  1831. }
  1832. return action_result(pfn, MF_MSG_GET_HWPOISON, MF_IGNORED);
  1833. }
  1834. folio = page_folio(p);
  1835. folio_lock(folio);
  1836. if (hwpoison_filter(p)) {
  1837. folio_clear_hugetlb_hwpoison(folio);
  1838. if (migratable_cleared)
  1839. folio_set_hugetlb_migratable(folio);
  1840. folio_unlock(folio);
  1841. if (res == 1)
  1842. folio_put(folio);
  1843. return -EOPNOTSUPP;
  1844. }
  1845. /*
  1846. * Handling free hugepage. The possible race with hugepage allocation
  1847. * or demotion can be prevented by PageHWPoison flag.
  1848. */
  1849. if (res == 0) {
  1850. folio_unlock(folio);
  1851. if (__page_handle_poison(p) > 0) {
  1852. page_ref_inc(p);
  1853. res = MF_RECOVERED;
  1854. } else {
  1855. res = MF_FAILED;
  1856. }
  1857. return action_result(pfn, MF_MSG_FREE_HUGE, res);
  1858. }
  1859. page_flags = folio->flags;
  1860. if (!hwpoison_user_mappings(folio, p, pfn, flags)) {
  1861. folio_unlock(folio);
  1862. return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_FAILED);
  1863. }
  1864. return identify_page_state(pfn, p, page_flags);
  1865. }
  1866. #else
  1867. static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
  1868. {
  1869. return 0;
  1870. }
  1871. static inline unsigned long folio_free_raw_hwp(struct folio *folio, bool flag)
  1872. {
  1873. return 0;
  1874. }
  1875. #endif /* CONFIG_HUGETLB_PAGE */
  1876. /* Drop the extra refcount in case we come from madvise() */
  1877. static void put_ref_page(unsigned long pfn, int flags)
  1878. {
  1879. if (!(flags & MF_COUNT_INCREASED))
  1880. return;
  1881. put_page(pfn_to_page(pfn));
  1882. }
  1883. static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
  1884. struct dev_pagemap *pgmap)
  1885. {
  1886. int rc = -ENXIO;
  1887. /* device metadata space is not recoverable */
  1888. if (!pgmap_pfn_valid(pgmap, pfn))
  1889. goto out;
  1890. /*
  1891. * Call driver's implementation to handle the memory failure, otherwise
  1892. * fall back to generic handler.
  1893. */
  1894. if (pgmap_has_memory_failure(pgmap)) {
  1895. rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
  1896. /*
  1897. * Fall back to generic handler too if operation is not
  1898. * supported inside the driver/device/filesystem.
  1899. */
  1900. if (rc != -EOPNOTSUPP)
  1901. goto out;
  1902. }
  1903. rc = mf_generic_kill_procs(pfn, flags, pgmap);
  1904. out:
  1905. /* drop pgmap ref acquired in caller */
  1906. put_dev_pagemap(pgmap);
  1907. if (rc != -EOPNOTSUPP)
  1908. action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
  1909. return rc;
  1910. }
  1911. /*
1912. * We get here when a thp split has failed and the page might have
1913. * been RDMA pinned, so not much can be done for recovery.
1914. * But a SIGBUS should be delivered with the vaddr provided so that the
1915. * user application has a chance to recover. Also, an application
1916. * process's election for early kill on MCE is honored.
  1917. */
  1918. static void kill_procs_now(struct page *p, unsigned long pfn, int flags,
  1919. struct folio *folio)
  1920. {
  1921. LIST_HEAD(tokill);
  1922. collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
  1923. kill_procs(&tokill, true, pfn, flags);
  1924. }
  1925. /**
  1926. * memory_failure - Handle memory failure of a page.
  1927. * @pfn: Page Number of the corrupted page
  1928. * @flags: fine tune action taken
  1929. *
  1930. * This function is called by the low level machine check code
  1931. * of an architecture when it detects hardware memory corruption
  1932. * of a page. It tries its best to recover, which includes
  1933. * dropping pages, killing processes etc.
  1934. *
  1935. * The function is primarily of use for corruptions that
  1936. * happen outside the current execution context (e.g. when
  1937. * detected by a background scrubber)
  1938. *
  1939. * Must run in process context (e.g. a work queue) with interrupts
  1940. * enabled and no spinlocks held.
  1941. *
1942. * Return: 0 when the memory error was successfully handled,
1943. * -EOPNOTSUPP when hwpoison_filter() filtered the error event,
1944. * < 0 (other than -EOPNOTSUPP) on failure.
  1945. */
  1946. int memory_failure(unsigned long pfn, int flags)
  1947. {
  1948. struct page *p;
  1949. struct folio *folio;
  1950. struct dev_pagemap *pgmap;
  1951. int res = 0;
  1952. unsigned long page_flags;
  1953. bool retry = true;
  1954. int hugetlb = 0;
  1955. if (!sysctl_memory_failure_recovery)
  1956. panic("Memory failure on page %lx", pfn);
  1957. mutex_lock(&mf_mutex);
  1958. if (!(flags & MF_SW_SIMULATED))
  1959. hw_memory_failure = true;
  1960. p = pfn_to_online_page(pfn);
  1961. if (!p) {
  1962. res = arch_memory_failure(pfn, flags);
  1963. if (res == 0)
  1964. goto unlock_mutex;
  1965. if (pfn_valid(pfn)) {
  1966. pgmap = get_dev_pagemap(pfn, NULL);
  1967. put_ref_page(pfn, flags);
  1968. if (pgmap) {
  1969. res = memory_failure_dev_pagemap(pfn, flags,
  1970. pgmap);
  1971. goto unlock_mutex;
  1972. }
  1973. }
  1974. pr_err("%#lx: memory outside kernel control\n", pfn);
  1975. res = -ENXIO;
  1976. goto unlock_mutex;
  1977. }
  1978. try_again:
  1979. res = try_memory_failure_hugetlb(pfn, flags, &hugetlb);
  1980. if (hugetlb)
  1981. goto unlock_mutex;
  1982. if (TestSetPageHWPoison(p)) {
  1983. pr_err("%#lx: already hardware poisoned\n", pfn);
  1984. res = -EHWPOISON;
  1985. if (flags & MF_ACTION_REQUIRED)
  1986. res = kill_accessing_process(current, pfn, flags);
  1987. if (flags & MF_COUNT_INCREASED)
  1988. put_page(p);
  1989. action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
  1990. goto unlock_mutex;
  1991. }
  1992. /*
1993. * We neither need to nor can do anything about count=0 pages:
1994. * 1) it's a free page, and therefore in safe hands:
1995. * check_new_page() will be the gatekeeper.
1996. * 2) it's part of a non-compound high-order page.
1997. * That implies some kernel user: we cannot stop them from
1998. * reading or writing the page; hope that the page has been
1999. * used and will be freed some time later.
2000. * In fact it's dangerous to directly bump up the page count from 0,
2001. * as that may cause a page_ref_freeze()/page_ref_unfreeze() mismatch.
  2002. */
  2003. if (!(flags & MF_COUNT_INCREASED)) {
  2004. res = get_hwpoison_page(p, flags);
  2005. if (!res) {
  2006. if (is_free_buddy_page(p)) {
  2007. if (take_page_off_buddy(p)) {
  2008. page_ref_inc(p);
  2009. res = MF_RECOVERED;
  2010. } else {
  2011. /* We lost the race, try again */
  2012. if (retry) {
  2013. ClearPageHWPoison(p);
  2014. retry = false;
  2015. goto try_again;
  2016. }
  2017. res = MF_FAILED;
  2018. }
  2019. res = action_result(pfn, MF_MSG_BUDDY, res);
  2020. } else {
  2021. res = action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
  2022. }
  2023. goto unlock_mutex;
  2024. } else if (res < 0) {
  2025. res = action_result(pfn, MF_MSG_GET_HWPOISON, MF_IGNORED);
  2026. goto unlock_mutex;
  2027. }
  2028. }
  2029. folio = page_folio(p);
  2030. /* filter pages that are protected from hwpoison test by users */
  2031. folio_lock(folio);
  2032. if (hwpoison_filter(p)) {
  2033. ClearPageHWPoison(p);
  2034. folio_unlock(folio);
  2035. folio_put(folio);
  2036. res = -EOPNOTSUPP;
  2037. goto unlock_mutex;
  2038. }
  2039. folio_unlock(folio);
  2040. if (folio_test_large(folio)) {
  2041. /*
  2042. * The flag must be set after the refcount is bumped
  2043. * otherwise it may race with THP split.
  2044. * And the flag can't be set in get_hwpoison_page() since
  2045. * it is called by soft offline too and it is just called
  2046. * for !MF_COUNT_INCREASED. So here seems to be the best
  2047. * place.
  2048. *
2049. * No need to care about the above error handling paths for
  2050. * get_hwpoison_page() since they handle either free page
  2051. * or unhandlable page. The refcount is bumped iff the
  2052. * page is a valid handlable page.
  2053. */
  2054. folio_set_has_hwpoisoned(folio);
  2055. if (try_to_split_thp_page(p, false) < 0) {
  2056. res = -EHWPOISON;
  2057. kill_procs_now(p, pfn, flags, folio);
  2058. put_page(p);
  2059. action_result(pfn, MF_MSG_UNSPLIT_THP, MF_FAILED);
  2060. goto unlock_mutex;
  2061. }
  2062. VM_BUG_ON_PAGE(!page_count(p), p);
  2063. folio = page_folio(p);
  2064. }
  2065. /*
  2066. * We ignore non-LRU pages for good reasons.
  2067. * - PG_locked is only well defined for LRU pages and a few others
  2068. * - to avoid races with __SetPageLocked()
  2069. * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
  2070. * The check (unnecessarily) ignores LRU pages being isolated and
  2071. * walked by the page reclaim code, however that's not a big loss.
  2072. */
  2073. shake_folio(folio);
  2074. folio_lock(folio);
  2075. /*
2076. * We only intend to deal with non-compound pages here.
2077. * The page cannot become a compound page again as the folio has been
2078. * split and an extra refcount is held.
  2079. */
  2080. WARN_ON(folio_test_large(folio));
  2081. /*
  2082. * We use page flags to determine what action should be taken, but
  2083. * the flags can be modified by the error containment action. One
  2084. * example is an mlocked page, where PG_mlocked is cleared by
  2085. * folio_remove_rmap_*() in try_to_unmap_one(). So to determine page
  2086. * status correctly, we save a copy of the page flags at this time.
  2087. */
  2088. page_flags = folio->flags;
  2089. /*
  2090. * __munlock_folio() may clear a writeback folio's LRU flag without
  2091. * the folio lock. We need to wait for writeback completion for this
2092. * folio, or it may trigger a vfs BUG while evicting the inode.
  2093. */
  2094. if (!folio_test_lru(folio) && !folio_test_writeback(folio))
  2095. goto identify_page_state;
  2096. /*
  2097. * It's very difficult to mess with pages currently under IO
  2098. * and in many cases impossible, so we just avoid it here.
  2099. */
  2100. folio_wait_writeback(folio);
  2101. /*
  2102. * Now take care of user space mappings.
  2103. * Abort on fail: __filemap_remove_folio() assumes unmapped page.
  2104. */
  2105. if (!hwpoison_user_mappings(folio, p, pfn, flags)) {
  2106. res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_FAILED);
  2107. goto unlock_page;
  2108. }
  2109. /*
  2110. * Torn down by someone else?
  2111. */
  2112. if (folio_test_lru(folio) && !folio_test_swapcache(folio) &&
  2113. folio->mapping == NULL) {
  2114. res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
  2115. goto unlock_page;
  2116. }
  2117. identify_page_state:
  2118. res = identify_page_state(pfn, p, page_flags);
  2119. mutex_unlock(&mf_mutex);
  2120. return res;
  2121. unlock_page:
  2122. folio_unlock(folio);
  2123. unlock_mutex:
  2124. mutex_unlock(&mf_mutex);
  2125. return res;
  2126. }
  2127. EXPORT_SYMBOL_GPL(memory_failure);
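/*
 * Example: a minimal sketch of a caller that reports an uncorrectable error
 * found by a background scrubber. The work handler and the way the bad pfn
 * is obtained are hypothetical; the requirement to call memory_failure()
 * from process context with interrupts enabled is taken from the kernel-doc
 * above.
 *
 *	static void example_scrub_work(struct work_struct *work)
 *	{
 *		unsigned long bad_pfn = example_get_reported_pfn();	// hypothetical
 *		int ret;
 *
 *		ret = memory_failure(bad_pfn, 0);
 *		if (ret && ret != -EOPNOTSUPP)
 *			pr_warn("recovery of pfn %#lx failed: %d\n", bad_pfn, ret);
 *	}
 */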
  2128. #define MEMORY_FAILURE_FIFO_ORDER 4
  2129. #define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
  2130. struct memory_failure_entry {
  2131. unsigned long pfn;
  2132. int flags;
  2133. };
  2134. struct memory_failure_cpu {
  2135. DECLARE_KFIFO(fifo, struct memory_failure_entry,
  2136. MEMORY_FAILURE_FIFO_SIZE);
  2137. raw_spinlock_t lock;
  2138. struct work_struct work;
  2139. };
  2140. static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
  2141. /**
  2142. * memory_failure_queue - Schedule handling memory failure of a page.
  2143. * @pfn: Page Number of the corrupted page
  2144. * @flags: Flags for memory failure handling
  2145. *
  2146. * This function is called by the low level hardware error handler
  2147. * when it detects hardware memory corruption of a page. It schedules
  2148. * the recovering of error page, including dropping pages, killing
  2149. * processes etc.
  2150. *
  2151. * The function is primarily of use for corruptions that
  2152. * happen outside the current execution context (e.g. when
  2153. * detected by a background scrubber)
  2154. *
  2155. * Can run in IRQ context.
  2156. */
  2157. void memory_failure_queue(unsigned long pfn, int flags)
  2158. {
  2159. struct memory_failure_cpu *mf_cpu;
  2160. unsigned long proc_flags;
  2161. bool buffer_overflow;
  2162. struct memory_failure_entry entry = {
  2163. .pfn = pfn,
  2164. .flags = flags,
  2165. };
  2166. mf_cpu = &get_cpu_var(memory_failure_cpu);
  2167. raw_spin_lock_irqsave(&mf_cpu->lock, proc_flags);
  2168. buffer_overflow = !kfifo_put(&mf_cpu->fifo, entry);
  2169. if (!buffer_overflow)
  2170. schedule_work_on(smp_processor_id(), &mf_cpu->work);
  2171. raw_spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
  2172. put_cpu_var(memory_failure_cpu);
  2173. if (buffer_overflow)
  2174. pr_err("buffer overflow when queuing memory failure at %#lx\n",
  2175. pfn);
  2176. }
  2177. EXPORT_SYMBOL_GPL(memory_failure_queue);
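/*
 * Example: a minimal sketch of deferring recovery from an error interrupt.
 * The IRQ handler and the address decoding are hypothetical; that
 * memory_failure_queue() may be called from IRQ context is taken from the
 * kernel-doc above.
 *
 *	static irqreturn_t example_mem_error_irq(int irq, void *data)
 *	{
 *		phys_addr_t paddr = example_decode_error_address(data);	// hypothetical
 *
 *		memory_failure_queue(PHYS_PFN(paddr), 0);
 *		return IRQ_HANDLED;
 *	}
 */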
  2178. static void memory_failure_work_func(struct work_struct *work)
  2179. {
  2180. struct memory_failure_cpu *mf_cpu;
  2181. struct memory_failure_entry entry = { 0, };
  2182. unsigned long proc_flags;
  2183. int gotten;
  2184. mf_cpu = container_of(work, struct memory_failure_cpu, work);
  2185. for (;;) {
  2186. raw_spin_lock_irqsave(&mf_cpu->lock, proc_flags);
  2187. gotten = kfifo_get(&mf_cpu->fifo, &entry);
  2188. raw_spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
  2189. if (!gotten)
  2190. break;
  2191. if (entry.flags & MF_SOFT_OFFLINE)
  2192. soft_offline_page(entry.pfn, entry.flags);
  2193. else
  2194. memory_failure(entry.pfn, entry.flags);
  2195. }
  2196. }
  2197. /*
  2198. * Process memory_failure work queued on the specified CPU.
  2199. * Used to avoid return-to-userspace racing with the memory_failure workqueue.
  2200. */
  2201. void memory_failure_queue_kick(int cpu)
  2202. {
  2203. struct memory_failure_cpu *mf_cpu;
  2204. mf_cpu = &per_cpu(memory_failure_cpu, cpu);
  2205. cancel_work_sync(&mf_cpu->work);
  2206. memory_failure_work_func(&mf_cpu->work);
  2207. }
  2208. static int __init memory_failure_init(void)
  2209. {
  2210. struct memory_failure_cpu *mf_cpu;
  2211. int cpu;
  2212. for_each_possible_cpu(cpu) {
  2213. mf_cpu = &per_cpu(memory_failure_cpu, cpu);
  2214. raw_spin_lock_init(&mf_cpu->lock);
  2215. INIT_KFIFO(mf_cpu->fifo);
  2216. INIT_WORK(&mf_cpu->work, memory_failure_work_func);
  2217. }
  2218. register_sysctl_init("vm", memory_failure_table);
  2219. return 0;
  2220. }
  2221. core_initcall(memory_failure_init);
  2222. #undef pr_fmt
  2223. #define pr_fmt(fmt) "Unpoison: " fmt
  2224. #define unpoison_pr_info(fmt, pfn, rs) \
  2225. ({ \
  2226. if (__ratelimit(rs)) \
  2227. pr_info(fmt, pfn); \
  2228. })
  2229. /**
  2230. * unpoison_memory - Unpoison a previously poisoned page
2231. * @pfn: Page number of the page to be unpoisoned
  2232. *
  2233. * Software-unpoison a page that has been poisoned by
  2234. * memory_failure() earlier.
  2235. *
2236. * This is only done at the software level, so it only works
2237. * for Linux-injected failures, not real hardware failures
  2238. *
  2239. * Returns 0 for success, otherwise -errno.
  2240. */
  2241. int unpoison_memory(unsigned long pfn)
  2242. {
  2243. struct folio *folio;
  2244. struct page *p;
  2245. int ret = -EBUSY, ghp;
  2246. unsigned long count;
  2247. bool huge = false;
  2248. static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
  2249. DEFAULT_RATELIMIT_BURST);
  2250. if (!pfn_valid(pfn))
  2251. return -ENXIO;
  2252. p = pfn_to_page(pfn);
  2253. folio = page_folio(p);
  2254. mutex_lock(&mf_mutex);
  2255. if (hw_memory_failure) {
  2256. unpoison_pr_info("%#lx: disabled after HW memory failure\n",
  2257. pfn, &unpoison_rs);
  2258. ret = -EOPNOTSUPP;
  2259. goto unlock_mutex;
  2260. }
  2261. if (is_huge_zero_folio(folio)) {
  2262. unpoison_pr_info("%#lx: huge zero page is not supported\n",
  2263. pfn, &unpoison_rs);
  2264. ret = -EOPNOTSUPP;
  2265. goto unlock_mutex;
  2266. }
  2267. if (!PageHWPoison(p)) {
  2268. unpoison_pr_info("%#lx: page was already unpoisoned\n",
  2269. pfn, &unpoison_rs);
  2270. goto unlock_mutex;
  2271. }
  2272. if (folio_ref_count(folio) > 1) {
  2273. unpoison_pr_info("%#lx: someone grabs the hwpoison page\n",
  2274. pfn, &unpoison_rs);
  2275. goto unlock_mutex;
  2276. }
  2277. if (folio_test_slab(folio) || folio_test_pgtable(folio) ||
  2278. folio_test_reserved(folio) || folio_test_offline(folio))
  2279. goto unlock_mutex;
  2280. if (folio_mapped(folio)) {
  2281. unpoison_pr_info("%#lx: someone maps the hwpoison page\n",
  2282. pfn, &unpoison_rs);
  2283. goto unlock_mutex;
  2284. }
  2285. if (folio_mapping(folio)) {
  2286. unpoison_pr_info("%#lx: the hwpoison page has non-NULL mapping\n",
  2287. pfn, &unpoison_rs);
  2288. goto unlock_mutex;
  2289. }
  2290. ghp = get_hwpoison_page(p, MF_UNPOISON);
  2291. if (!ghp) {
  2292. if (folio_test_hugetlb(folio)) {
  2293. huge = true;
  2294. count = folio_free_raw_hwp(folio, false);
  2295. if (count == 0)
  2296. goto unlock_mutex;
  2297. }
  2298. ret = folio_test_clear_hwpoison(folio) ? 0 : -EBUSY;
  2299. } else if (ghp < 0) {
  2300. if (ghp == -EHWPOISON) {
  2301. ret = put_page_back_buddy(p) ? 0 : -EBUSY;
  2302. } else {
  2303. ret = ghp;
  2304. unpoison_pr_info("%#lx: failed to grab page\n",
  2305. pfn, &unpoison_rs);
  2306. }
  2307. } else {
  2308. if (folio_test_hugetlb(folio)) {
  2309. huge = true;
  2310. count = folio_free_raw_hwp(folio, false);
  2311. if (count == 0) {
  2312. folio_put(folio);
  2313. goto unlock_mutex;
  2314. }
  2315. }
  2316. folio_put(folio);
  2317. if (TestClearPageHWPoison(p)) {
  2318. folio_put(folio);
  2319. ret = 0;
  2320. }
  2321. }
  2322. unlock_mutex:
  2323. mutex_unlock(&mf_mutex);
  2324. if (!ret) {
  2325. if (!huge)
  2326. num_poisoned_pages_sub(pfn, 1);
  2327. unpoison_pr_info("%#lx: software-unpoisoned page\n",
  2328. page_to_pfn(p), &unpoison_rs);
  2329. }
  2330. return ret;
  2331. }
  2332. EXPORT_SYMBOL(unpoison_memory);
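/*
 * Example: unpoison_memory() is normally exercised through the
 * hwpoison-inject debugfs interface. A user-space sketch, assuming
 * CONFIG_HWPOISON_INJECT is enabled and debugfs is mounted at
 * /sys/kernel/debug; only software-injected poison can be cleared this way.
 *
 *	#include <stdio.h>
 *
 *	static int unpoison_pfn(unsigned long pfn)
 *	{
 *		FILE *f = fopen("/sys/kernel/debug/hwpoison/unpoison-pfn", "w");
 *
 *		if (!f)
 *			return -1;
 *		fprintf(f, "%lu\n", pfn);
 *		return fclose(f);
 *	}
 */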
  2333. #undef pr_fmt
  2334. #define pr_fmt(fmt) "Soft offline: " fmt
  2335. /*
2336. * soft_offline_in_use_page handles hugetlb pages and non-hugetlb pages.
2337. * If the page is a non-dirty unmapped page-cache page, it is simply invalidated.
2338. * If the page is mapped, its contents are migrated over.
  2339. */
  2340. static int soft_offline_in_use_page(struct page *page)
  2341. {
  2342. long ret = 0;
  2343. unsigned long pfn = page_to_pfn(page);
  2344. struct folio *folio = page_folio(page);
  2345. char const *msg_page[] = {"page", "hugepage"};
  2346. bool huge = folio_test_hugetlb(folio);
  2347. bool isolated;
  2348. LIST_HEAD(pagelist);
  2349. struct migration_target_control mtc = {
  2350. .nid = NUMA_NO_NODE,
  2351. .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
  2352. .reason = MR_MEMORY_FAILURE,
  2353. };
  2354. if (!huge && folio_test_large(folio)) {
  2355. if (try_to_split_thp_page(page, true)) {
  2356. pr_info("%#lx: thp split failed\n", pfn);
  2357. return -EBUSY;
  2358. }
  2359. folio = page_folio(page);
  2360. }
  2361. folio_lock(folio);
  2362. if (!huge)
  2363. folio_wait_writeback(folio);
  2364. if (PageHWPoison(page)) {
  2365. folio_unlock(folio);
  2366. folio_put(folio);
  2367. pr_info("%#lx: page already poisoned\n", pfn);
  2368. return 0;
  2369. }
  2370. if (!huge && folio_test_lru(folio) && !folio_test_swapcache(folio))
  2371. /*
  2372. * Try to invalidate first. This should work for
  2373. * non dirty unmapped page cache pages.
  2374. */
  2375. ret = mapping_evict_folio(folio_mapping(folio), folio);
  2376. folio_unlock(folio);
  2377. if (ret) {
  2378. pr_info("%#lx: invalidated\n", pfn);
  2379. page_handle_poison(page, false, true);
  2380. return 0;
  2381. }
  2382. isolated = isolate_folio_to_list(folio, &pagelist);
  2383. /*
2384. * If we succeed in isolating the folio, we grabbed another refcount on
  2385. * the folio, so we can safely drop the one we got from get_any_page().
  2386. * If we failed to isolate the folio, it means that we cannot go further
  2387. * and we will return an error, so drop the reference we got from
  2388. * get_any_page() as well.
  2389. */
  2390. folio_put(folio);
  2391. if (isolated) {
  2392. ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
  2393. (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL);
  2394. if (!ret) {
  2395. bool release = !huge;
  2396. if (!page_handle_poison(page, huge, release))
  2397. ret = -EBUSY;
  2398. } else {
  2399. if (!list_empty(&pagelist))
  2400. putback_movable_pages(&pagelist);
  2401. pr_info("%#lx: %s migration failed %ld, type %pGp\n",
  2402. pfn, msg_page[huge], ret, &page->flags);
  2403. if (ret > 0)
  2404. ret = -EBUSY;
  2405. }
  2406. } else {
  2407. pr_info("%#lx: %s isolation failed, page count %d, type %pGp\n",
  2408. pfn, msg_page[huge], page_count(page), &page->flags);
  2409. ret = -EBUSY;
  2410. }
  2411. return ret;
  2412. }
  2413. /**
  2414. * soft_offline_page - Soft offline a page.
  2415. * @pfn: pfn to soft-offline
  2416. * @flags: flags. Same as memory_failure().
  2417. *
  2418. * Returns 0 on success,
2419. * -EOPNOTSUPP when hwpoison_filter() filtered the error event, or when
2420. * soft offline is disabled by /proc/sys/vm/enable_soft_offline,
2421. * < 0 otherwise (a negated errno).
  2422. *
  2423. * Soft offline a page, by migration or invalidation,
  2424. * without killing anything. This is for the case when
  2425. * a page is not corrupted yet (so it's still valid to access),
  2426. * but has had a number of corrected errors and is better taken
  2427. * out.
  2428. *
  2429. * The actual policy on when to do that is maintained by
  2430. * user space.
  2431. *
  2432. * This should never impact any application or cause data loss,
  2433. * however it might take some time.
  2434. *
  2435. * This is not a 100% solution for all memory, but tries to be
  2436. * ``good enough'' for the majority of memory.
  2437. */
  2438. int soft_offline_page(unsigned long pfn, int flags)
  2439. {
  2440. int ret;
  2441. bool try_again = true;
  2442. struct page *page;
  2443. if (!pfn_valid(pfn)) {
  2444. WARN_ON_ONCE(flags & MF_COUNT_INCREASED);
  2445. return -ENXIO;
  2446. }
  2447. /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
  2448. page = pfn_to_online_page(pfn);
  2449. if (!page) {
  2450. put_ref_page(pfn, flags);
  2451. return -EIO;
  2452. }
  2453. if (!sysctl_enable_soft_offline) {
  2454. pr_info_once("disabled by /proc/sys/vm/enable_soft_offline\n");
  2455. put_ref_page(pfn, flags);
  2456. return -EOPNOTSUPP;
  2457. }
  2458. mutex_lock(&mf_mutex);
  2459. if (PageHWPoison(page)) {
  2460. pr_info("%#lx: page already poisoned\n", pfn);
  2461. put_ref_page(pfn, flags);
  2462. mutex_unlock(&mf_mutex);
  2463. return 0;
  2464. }
  2465. retry:
  2466. get_online_mems();
  2467. ret = get_hwpoison_page(page, flags | MF_SOFT_OFFLINE);
  2468. put_online_mems();
  2469. if (hwpoison_filter(page)) {
  2470. if (ret > 0)
  2471. put_page(page);
  2472. mutex_unlock(&mf_mutex);
  2473. return -EOPNOTSUPP;
  2474. }
  2475. if (ret > 0) {
  2476. ret = soft_offline_in_use_page(page);
  2477. } else if (ret == 0) {
  2478. if (!page_handle_poison(page, true, false)) {
  2479. if (try_again) {
  2480. try_again = false;
  2481. flags &= ~MF_COUNT_INCREASED;
  2482. goto retry;
  2483. }
  2484. ret = -EBUSY;
  2485. }
  2486. }
  2487. mutex_unlock(&mf_mutex);
  2488. return ret;
  2489. }
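/*
 * Example: soft_offline_page() can be reached from user space via
 * madvise(MADV_SOFT_OFFLINE) on a mapped page. A user-space sketch,
 * assuming a kernel with CONFIG_MEMORY_FAILURE, CAP_SYS_ADMIN, and
 * soft offline not disabled via /proc/sys/vm/enable_soft_offline.
 *
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	#ifndef MADV_SOFT_OFFLINE
 *	#define MADV_SOFT_OFFLINE 101
 *	#endif
 *
 *	static int soft_offline_mapped_page(void *addr)
 *	{
 *		long pagesz = sysconf(_SC_PAGESIZE);
 *
 *		return madvise(addr, pagesz, MADV_SOFT_OFFLINE);
 *	}
 */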