fault.c

// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
 *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
 */
#include <linux/sched.h>		/* test_thread_flag(), ...	*/
#include <linux/sched/task_stack.h>	/* task_stack_*(), ...		*/
#include <linux/kdebug.h>		/* oops_begin/end, ...		*/
#include <linux/extable.h>		/* search_exception_tables	*/
#include <linux/memblock.h>		/* max_low_pfn			*/
#include <linux/kfence.h>		/* kfence_handle_page_fault	*/
#include <linux/kprobes.h>		/* NOKPROBE_SYMBOL, ...		*/
#include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
#include <linux/perf_event.h>		/* perf_sw_event		*/
#include <linux/hugetlb.h>		/* hstate_index_to_shift	*/
#include <linux/prefetch.h>		/* prefetchw			*/
#include <linux/context_tracking.h>	/* exception_enter(), ...	*/
#include <linux/uaccess.h>		/* faulthandler_disabled()	*/
#include <linux/efi.h>			/* efi_crash_gracefully_on_page_fault()	*/
#include <linux/mm_types.h>
#include <linux/mm.h>			/* find_and_lock_vma()		*/
#include <linux/vmalloc.h>

#include <asm/cpufeature.h>		/* boot_cpu_has, ...		*/
#include <asm/traps.h>			/* dotraplinkage, ...		*/
#include <asm/fixmap.h>			/* VSYSCALL_ADDR		*/
#include <asm/vsyscall.h>		/* emulate_vsyscall		*/
#include <asm/vm86.h>			/* struct vm86			*/
#include <asm/mmu_context.h>		/* vma_pkey()			*/
#include <asm/efi.h>			/* efi_crash_gracefully_on_page_fault()	*/
#include <asm/desc.h>			/* store_idt(), ...		*/
#include <asm/cpu_entry_area.h>		/* exception stack		*/
#include <asm/pgtable_areas.h>		/* VMALLOC_START, ...		*/
#include <asm/kvm_para.h>		/* kvm_handle_async_pf		*/
#include <asm/vdso.h>			/* fixup_vdso_exception()	*/
#include <asm/irq_stack.h>
#include <asm/fred.h>
#include <asm/sev.h>			/* snp_dump_hva_rmpentry()	*/

#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>

/*
 * Returns 0 if mmiotrace is disabled, or if the fault is not
 * handled by mmiotrace:
 */
static nokprobe_inline int
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
	if (unlikely(is_kmmio_active()))
		if (kmmio_handler(regs, addr) == 1)
			return -1;
	return 0;
}

/*
 * Prefetch quirks:
 *
 * 32-bit mode:
 *
 *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 *   Check that here and ignore it.  This is AMD erratum #91.
 *
 * 64-bit mode:
 *
 *   Sometimes the CPU reports invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
		      unsigned char opcode, int *prefetch)
{
	unsigned char instr_hi = opcode & 0xf0;
	unsigned char instr_lo = opcode & 0x0f;

	switch (instr_hi) {
	case 0x20:
	case 0x30:
		/*
		 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
		 * In X86_64 long mode, the CPU will signal invalid
		 * opcode if some of these prefixes are present so
		 * X86_64 will never get here anyway
		 */
		return ((instr_lo & 7) == 0x6);
#ifdef CONFIG_X86_64
	case 0x40:
		/*
		 * In 64-bit mode 0x40..0x4F are valid REX prefixes
		 */
		return (!user_mode(regs) || user_64bit_mode(regs));
#endif
	case 0x60:
		/* 0x64 thru 0x67 are valid prefixes in all modes. */
		return (instr_lo & 0xC) == 0x4;
	case 0xF0:
		/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
		return !instr_lo || (instr_lo>>1) == 1;
	case 0x00:
		/* Prefetch instruction is 0x0F0D or 0x0F18 */
		if (get_kernel_nofault(opcode, instr))
			return 0;

		*prefetch = (instr_lo == 0xF) &&
			(opcode == 0x0D || opcode == 0x18);
		return 0;
	default:
		return 0;
	}
}
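
/*
 * AMD erratum #91 is limited to K8 (family 0xf) parts that predate the
 * NPT-capable revisions, i.e. model < 0x40.
 */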
static bool is_amd_k8_pre_npt(void)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) &&
			c->x86_vendor == X86_VENDOR_AMD &&
			c->x86 == 0xf && c->x86_model < 0x40);
}

static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
	unsigned char *max_instr;
	unsigned char *instr;
	int prefetch = 0;

	/* Erratum #91 affects AMD K8, pre-NPT CPUs */
	if (!is_amd_k8_pre_npt())
		return 0;

	/*
	 * If it was an exec (instruction fetch) fault on an NX page, then
	 * do not ignore the fault:
	 */
	if (error_code & X86_PF_INSTR)
		return 0;

	instr = (void *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	/*
	 * This code has historically always bailed out if IP points to a
	 * not-present page (e.g. due to a race).  No one has ever
	 * complained about this.
	 */
	pagefault_disable();
	while (instr < max_instr) {
		unsigned char opcode;

		if (user_mode(regs)) {
			if (get_user(opcode, (unsigned char __user *) instr))
				break;
		} else {
			if (get_kernel_nofault(opcode, instr))
				break;
		}

		instr++;

		if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
			break;
	}
	pagefault_enable();
	return prefetch;
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

#ifdef CONFIG_X86_32
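/*
 * Synchronize one PMD entry for @address from the reference page table
 * (init_mm.pgd) into the page table rooted at @pgd.  Returns the kernel
 * PMD entry, or NULL if it is not present.
 */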
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	p4d_t *p4d, *p4d_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_p4d/set_pud.
	 */
	p4d = p4d_offset(pgd, address);
	p4d_k = p4d_offset(pgd_k, address);
	if (!p4d_present(*p4d_k))
		return NULL;

	pud = pud_offset(p4d, address);
	pud_k = pud_offset(p4d_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);

	if (pmd_present(*pmd) != pmd_present(*pmd_k))
		set_pmd(pmd, *pmd_k);

	if (!pmd_present(*pmd_k))
		return NULL;
	else
		BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k));

	return pmd_k;
}

/*
 * Handle a fault on the vmalloc or module mapping area
 *
 * This is needed because there is a race condition between the time
 * when the vmalloc mapping code updates the PMD to the point in time
 * where it synchronizes this update with the other page-tables in the
 * system.
 *
 * In this race window another thread/CPU can map an area on the same
 * PMD, find it already present, and not yet synchronize it with the
 * rest of the system.  As a result v[mz]alloc might return areas
 * which are not mapped in every page-table in the system, causing an
 * unhandled page-fault when they are accessed.
 */
static noinline int vmalloc_fault(unsigned long address)
{
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;

	/* Make sure we are in vmalloc area: */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3_pa();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;

	if (pmd_leaf(*pmd_k))
		return 0;

	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;

	return 0;
}
NOKPROBE_SYMBOL(vmalloc_fault);

void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start & PMD_MASK;
	     addr >= TASK_SIZE_MAX && addr < VMALLOC_END;
	     addr += PMD_SIZE) {
		struct page *page;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			spinlock_t *pgt_lock;

			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;

			spin_lock(pgt_lock);
			vmalloc_sync_one(page_address(page), addr);
			spin_unlock(pgt_lock);
		}
		spin_unlock(&pgd_lock);
	}
}

static bool low_pfn(unsigned long pfn)
{
	return pfn < max_low_pfn;
}
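
/* Dump the page-table walk for @address, starting from the current CR3. */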
static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3_pa());
	pgd_t *pgd = &base[pgd_index(address)];
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

#ifdef CONFIG_X86_PAE
	pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
	if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
		goto out;
#define pr_pde pr_cont
#else
#define pr_pde pr_info
#endif
	p4d = p4d_offset(pgd, address);
	pud = pud_offset(p4d, address);
	pmd = pmd_offset(pud, address);
	pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
#undef pr_pde

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already:
	 */
	if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_leaf(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
	pr_cont("\n");
}

#else /* CONFIG_X86_64: */

#ifdef CONFIG_CPU_SUP_AMD
static const char errata93_warning[] =
KERN_ERR
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";
#endif
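
/* Return non-zero if the page-table entry at @p cannot be read without faulting. */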
static int bad_address(void *p)
{
	unsigned long dummy;

	return get_kernel_nofault(dummy, (unsigned long *)p);
}

static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3_pa());
	pgd_t *pgd = base + pgd_index(address);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (bad_address(pgd))
		goto bad;

	pr_info("PGD %lx ", pgd_val(*pgd));

	if (!pgd_present(*pgd))
		goto out;

	p4d = p4d_offset(pgd, address);
	if (bad_address(p4d))
		goto bad;

	pr_cont("P4D %lx ", p4d_val(*p4d));
	if (!p4d_present(*p4d) || p4d_leaf(*p4d))
		goto out;

	pud = pud_offset(p4d, address);
	if (bad_address(pud))
		goto bad;

	pr_cont("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_leaf(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd))
		goto bad;

	pr_cont("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_leaf(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte))
		goto bad;

	pr_cont("PTE %lx", pte_val(*pte));
out:
	pr_cont("\n");
	return;
bad:
	pr_info("BAD\n");
}

#endif /* CONFIG_X86_64 */

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 *
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64-bit RIP register on C stepping K8.
 *
 * A lot of BIOSes that weren't tested properly miss this.
 *
 * The OS sees this as a page fault with the upper 32 bits of RIP cleared.
 * Try to work around it here.
 *
 * Note we only handle faults in kernel here.
 * Does nothing on 32-bit.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
	    || boot_cpu_data.x86 != 0xf)
		return 0;

	if (user_mode(regs))
		return 0;

	if (address != regs->ip)
		return 0;

	if ((address >> 32) != 0)
		return 0;

	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		printk_once(errata93_warning);
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps
 * to illegal addresses >4GB.
 *
 * We catch this in the page fault handler because these addresses
 * are not reachable.  Just detect this case and return.  Any code
 * segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
		return 1;
#endif
	return 0;
}

/* Pentium F0 0F C7 C8 bug workaround: */
static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code,
		       unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) &&
	    idt_is_f00f_address(address)) {
		handle_invalid_op(regs);
		return 1;
	}
#endif
	return 0;
}
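
/*
 * Decode and print the GDT descriptor that LDTR or TR points at, to help
 * debug faults that occur during exception delivery.
 */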
static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
{
	u32 offset = (index >> 3) * sizeof(struct desc_struct);
	unsigned long addr;
	struct ldttss_desc desc;

	if (index == 0) {
		pr_alert("%s: NULL\n", name);
		return;
	}

	if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
		pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
		return;
	}

	if (copy_from_kernel_nofault(&desc, (void *)(gdt->address + offset),
				     sizeof(struct ldttss_desc))) {
		pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
			 name, index);
		return;
	}

	addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24);
#ifdef CONFIG_X86_64
	addr |= ((u64)desc.base3 << 32);
#endif
	pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
		 name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
}
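
/* Print the human-readable part of the oops for an unhandled page fault. */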
static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
	if (!oops_may_print())
		return;

	if (error_code & X86_PF_INSTR) {
		unsigned int level;
		bool nx, rw;
		pgd_t *pgd;
		pte_t *pte;

		pgd = __va(read_cr3_pa());
		pgd += pgd_index(address);

		pte = lookup_address_in_pgd_attr(pgd, address, &level, &nx, &rw);

		if (pte && pte_present(*pte) && (!pte_exec(*pte) || nx))
			pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
				from_kuid(&init_user_ns, current_uid()));
		if (pte && pte_present(*pte) && pte_exec(*pte) && !nx &&
		    (pgd_flags(*pgd) & _PAGE_USER) &&
		    (__read_cr4() & X86_CR4_SMEP))
			pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
				from_kuid(&init_user_ns, current_uid()));
	}

	if (address < PAGE_SIZE && !user_mode(regs))
		pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
			 (void *)address);
	else
		pr_alert("BUG: unable to handle page fault for address: %px\n",
			 (void *)address);

	pr_alert("#PF: %s %s in %s mode\n",
		 (error_code & X86_PF_USER)  ? "user" : "supervisor",
		 (error_code & X86_PF_INSTR) ? "instruction fetch" :
		 (error_code & X86_PF_WRITE) ? "write access" :
					       "read access",
		 user_mode(regs) ? "user" : "kernel");
	pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
		 !(error_code & X86_PF_PROT) ? "not-present page" :
		 (error_code & X86_PF_RSVD)  ? "reserved bit violation" :
		 (error_code & X86_PF_PK)    ? "protection keys violation" :
		 (error_code & X86_PF_RMP)   ? "RMP violation" :
					       "permissions violation");

	if (!(error_code & X86_PF_USER) && user_mode(regs)) {
		struct desc_ptr idt, gdt;
		u16 ldtr, tr;

		/*
		 * This can happen for quite a few reasons.  The more obvious
		 * ones are faults accessing the GDT or LDT.  Perhaps
		 * surprisingly, if the CPU tries to deliver a benign or
		 * contributory exception from user code and gets a page fault
		 * during delivery, the page fault can be delivered as though
		 * it originated directly from user code.  This could happen
		 * due to wrong permissions on the IDT, GDT, LDT, TSS, or
		 * kernel or IST stack.
		 */
		store_idt(&idt);

		/* Usable even on Xen PV -- it's just slow. */
		native_store_gdt(&gdt);
		pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
			 idt.address, idt.size, gdt.address, gdt.size);

		store_ldt(ldtr);
		show_ldttss(&gdt, "LDTR", ldtr);

		store_tr(tr);
		show_ldttss(&gdt, "TR", tr);
	}

	dump_pagetable(address);

	if (error_code & X86_PF_RMP)
		snp_dump_hva_rmpentry(address);
}
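
/*
 * A reserved bit was found set in a page-table entry: the page tables are
 * corrupted.  Dump them and oops.
 */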
static noinline void
pgtable_bad(struct pt_regs *regs, unsigned long error_code,
	    unsigned long address)
{
	struct task_struct *tsk;
	unsigned long flags;
	int sig;

	flags = oops_begin();
	tsk = current;
	sig = SIGKILL;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       tsk->comm, address);
	dump_pagetable(address);

	if (__die("Bad pagetable", regs, error_code))
		sig = 0;

	oops_end(flags, regs, sig);
}

static void sanitize_error_code(unsigned long address,
				unsigned long *error_code)
{
	/*
	 * To avoid leaking information about the kernel page
	 * table layout, pretend that user-mode accesses to
	 * kernel addresses are always protection faults.
	 *
	 * NB: This means that failed vsyscalls with vsyscall=none
	 * will have the PROT bit.  This doesn't leak any
	 * information and does not appear to cause any problems.
	 */
	if (address >= TASK_SIZE_MAX)
		*error_code |= X86_PF_PROT;
}
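
/* Record the fault details in the task so the signal code can report them. */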
static void set_signal_archinfo(unsigned long address,
				unsigned long error_code)
{
	struct task_struct *tsk = current;

	tsk->thread.trap_nr = X86_TRAP_PF;
	tsk->thread.error_code = error_code | X86_PF_USER;
	tsk->thread.cr2 = address;
}
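
/*
 * The fault could not be handled: deal with the stack-overflow, EFI and
 * KFENCE special cases, then oops.
 */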
static noinline void
page_fault_oops(struct pt_regs *regs, unsigned long error_code,
		unsigned long address)
{
#ifdef CONFIG_VMAP_STACK
	struct stack_info info;
#endif
	unsigned long flags;
	int sig;

	if (user_mode(regs)) {
		/*
		 * Implicit kernel access from user mode?  Skip the stack
		 * overflow and EFI special cases.
		 */
		goto oops;
	}

#ifdef CONFIG_VMAP_STACK
	/*
	 * Stack overflow?  During boot, we can fault near the initial
	 * stack in the direct map, but that's not an overflow -- check
	 * that we're in vmalloc space to avoid this.
	 */
	if (is_vmalloc_addr((void *)address) &&
	    get_stack_guard_info((void *)address, &info)) {
		/*
		 * We're likely to be running with very little stack space
		 * left.  It's plausible that we'd hit this condition but
		 * double-fault even before we get this far, in which case
		 * we're fine: the double-fault handler will deal with it.
		 *
		 * We don't want to make it all the way into the oops code
		 * and then double-fault, though, because we're likely to
		 * break the console driver and lose most of the stack dump.
		 */
		call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*),
			      handle_stack_overflow,
			      ASM_CALL_ARG3,
			      , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info));

		BUG();
	}
#endif

	/*
	 * Buggy firmware could access regions which might page fault.  If
	 * this happens, EFI has a special OOPS path that will try to
	 * avoid hanging the system.
	 */
	if (IS_ENABLED(CONFIG_EFI))
		efi_crash_gracefully_on_page_fault(address);

	/* Only not-present faults should be handled by KFENCE. */
	if (!(error_code & X86_PF_PROT) &&
	    kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
		return;

oops:
	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice:
	 */
	flags = oops_begin();

	show_fault_oops(regs, error_code, address);

	if (task_stack_end_corrupted(current))
		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");

	sig = SIGKILL;
	if (__die("Oops", regs, error_code))
		sig = 0;

	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_DEFAULT "CR2: %016lx\n", address);

	oops_end(flags, regs, sig);
}
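
/*
 * A kernel-mode fault that the MM code could not handle: consult the
 * exception tables (and check for the prefetch erratum) before oopsing.
 */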
static noinline void
kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
			 unsigned long address, int signal, int si_code,
			 u32 pkey)
{
	WARN_ON_ONCE(user_mode(regs));

	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs, X86_TRAP_PF, error_code, address))
		return;

	/*
	 * AMD erratum #91 manifests as a spurious page fault on a PREFETCH
	 * instruction.
	 */
	if (is_prefetch(regs, error_code, address))
		return;

	page_fault_oops(regs, error_code, address);
}

/*
 * Print out info about fatal segfaults, if the show_unhandled_signals
 * sysctl is set:
 */
static inline void
show_signal_msg(struct pt_regs *regs, unsigned long error_code,
		unsigned long address, struct task_struct *tsk)
{
	const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;
	/* This is a racy snapshot, but it's better than nothing. */
	int cpu = raw_smp_processor_id();

	if (!unhandled_signal(tsk, SIGSEGV))
		return;

	if (!printk_ratelimit())
		return;

	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
	       loglvl, tsk->comm, task_pid_nr(tsk), address,
	       (void *)regs->ip, (void *)regs->sp, error_code);

	print_vma_addr(KERN_CONT " in ", regs->ip);

	/*
	 * Dump the likely CPU where the fatal segfault happened.
	 * This can help identify faulty hardware.
	 */
	printk(KERN_CONT " likely on CPU %d (core %d, socket %d)", cpu,
	       topology_core_id(cpu), topology_physical_package_id(cpu));

	printk(KERN_CONT "\n");

	show_opcodes(regs, loglvl);
}

static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
		       unsigned long address, u32 pkey, int si_code)
{
	struct task_struct *tsk = current;

	if (!user_mode(regs)) {
		kernelmode_fixup_or_oops(regs, error_code, address,
					 SIGSEGV, si_code, pkey);
		return;
	}

	if (!(error_code & X86_PF_USER)) {
		/* Implicit user access to kernel memory -- just oops */
		page_fault_oops(regs, error_code, address);
		return;
	}

	/*
	 * User mode accesses just cause a SIGSEGV.
	 * It's possible to have interrupts off here:
	 */
	local_irq_enable();

	/*
	 * Valid to do another page fault here because this one came
	 * from user space:
	 */
	if (is_prefetch(regs, error_code, address))
		return;

	if (is_errata100(regs, address))
		return;

	sanitize_error_code(address, &error_code);

	if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
		return;

	if (likely(show_unhandled_signals))
		show_signal_msg(regs, error_code, address, tsk);

	set_signal_archinfo(address, error_code);

	if (si_code == SEGV_PKUERR)
		force_sig_pkuerr((void __user *)address, pkey);
	else
		force_sig_fault(SIGSEGV, si_code, (void __user *)address);

	local_irq_disable();
}

static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
		     unsigned long address)
{
	__bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
}

static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
	   unsigned long address, struct mm_struct *mm,
	   struct vm_area_struct *vma, u32 pkey, int si_code)
{
	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
	if (mm)
		mmap_read_unlock(mm);
	else
		vma_end_read(vma);

	__bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
}

static inline bool bad_area_access_from_pkeys(unsigned long error_code,
					      struct vm_area_struct *vma)
{
	/* This code is always called on the current mm */
	bool foreign = false;

	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
		return false;
	if (error_code & X86_PF_PK)
		return true;
	/* this checks permission keys on the VMA: */
	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
				       (error_code & X86_PF_INSTR), foreign))
		return true;
	return false;
}

static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
		      unsigned long address, struct mm_struct *mm,
		      struct vm_area_struct *vma)
{
	/*
	 * This OSPKE check is not strictly necessary at runtime.
	 * But, doing it this way allows compiler optimizations
	 * if pkeys are compiled out.
	 */
	if (bad_area_access_from_pkeys(error_code, vma)) {
		/*
		 * A protection key fault means that the PKRU value did not allow
		 * access to some PTE.  Userspace can figure out what PKRU was
		 * from the XSAVE state.  This function captures the pkey from
		 * the vma and passes it to userspace so userspace can discover
		 * which protection key was set on the PTE.
		 *
		 * If we get here, we know that the hardware signaled an X86_PF_PK
		 * fault and that there was a VMA once we got in the fault
		 * handler.  It does *not* guarantee that the VMA we find here
		 * was the one that we faulted on.
		 *
		 * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
		 * 2. T1   : set PKRU to deny access to pkey=4, touches page
		 * 3. T1   : faults...
		 * 4.    T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
		 * 5. T1   : enters fault handler, takes mmap_lock, etc...
		 * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
		 *	     faulted on a pte with its pkey=4.
		 */
		u32 pkey = vma_pkey(vma);

		__bad_area(regs, error_code, address, mm, vma, pkey, SEGV_PKUERR);
	} else {
		__bad_area(regs, error_code, address, mm, vma, 0, SEGV_ACCERR);
	}
}
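
/* Deliver SIGBUS, or BUS_MCEERR_AR for hardware memory corruption. */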
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
	  vm_fault_t fault)
{
	/* Kernel mode? Handle exceptions or die: */
	if (!user_mode(regs)) {
		kernelmode_fixup_or_oops(regs, error_code, address,
					 SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY);
		return;
	}

	/* User-space => ok to do another page fault: */
	if (is_prefetch(regs, error_code, address))
		return;

	sanitize_error_code(address, &error_code);

	if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
		return;

	set_signal_archinfo(address, error_code);

#ifdef CONFIG_MEMORY_FAILURE
	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
		struct task_struct *tsk = current;
		unsigned lsb = 0;

		pr_err("MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
		       tsk->comm, tsk->pid, address);
		if (fault & VM_FAULT_HWPOISON_LARGE)
			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
		if (fault & VM_FAULT_HWPOISON)
			lsb = PAGE_SHIFT;
		force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
		return;
	}
#endif
	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
}
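
/*
 * Return non-zero if the faulting access is permitted by the given (leaf)
 * page-table entry, i.e. the fault was spurious.
 */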
static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
{
	if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
		return 0;

	if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
		return 0;

	return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry.
 *
 * This allows us to lazily refresh the TLB when increasing the
 * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
 * eagerly is very expensive since that implies doing a full
 * cross-processor TLB flush, even if no stale TLB entries exist
 * on other processors.
 *
 * Spurious faults may only occur if the TLB contains an entry with
 * fewer permissions than the page table entry.  Non-present (P = 0)
 * and reserved bit (R = 1) faults are never spurious.
 *
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 *
 * Returns non-zero if a spurious fault was handled, zero otherwise.
 *
 * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
 * (Optional Invalidation).
 */
static noinline int
spurious_kernel_fault(unsigned long error_code, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	int ret;

	/*
	 * Only writes to RO or instruction fetches from NX may cause
	 * spurious faults.
	 *
	 * These could be from user or supervisor accesses but the TLB
	 * is only lazily flushed after a kernel mapping protection
	 * change, so user accesses are not expected to cause spurious
	 * faults.
	 */
	if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
	    error_code != (X86_PF_INSTR | X86_PF_PROT))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		return 0;

	if (p4d_leaf(*p4d))
		return spurious_kernel_fault_check(error_code, (pte_t *) p4d);

	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		return 0;

	if (pud_leaf(*pud))
		return spurious_kernel_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_leaf(*pmd))
		return spurious_kernel_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	ret = spurious_kernel_fault_check(error_code, pte);
	if (!ret)
		return 0;

	/*
	 * Make sure we have permissions in PMD.
	 * If not, then there's a bug in the page tables:
	 */
	ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");

	return ret;
}
NOKPROBE_SYMBOL(spurious_kernel_fault);

int show_unhandled_signals = 1;
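
/* Return non-zero if the access that caused the fault is not permitted by @vma. */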
static inline int
access_error(unsigned long error_code, struct vm_area_struct *vma)
{
	/* This is only called for the current mm, so: */
	bool foreign = false;

	/*
	 * Read or write was blocked by protection keys.  This is
	 * always an unconditional error and can never result in
	 * a follow-up action to resolve the fault, like a COW.
	 */
	if (error_code & X86_PF_PK)
		return 1;

	/*
	 * SGX hardware blocked the access.  This usually happens
	 * when the enclave memory contents have been destroyed, like
	 * after a suspend/resume cycle. In any case, the kernel can't
	 * fix the cause of the fault.  Handle the fault as an access
	 * error even in cases where no actual access violation
	 * occurred.  This allows userspace to rebuild the enclave in
	 * response to the signal.
	 */
	if (unlikely(error_code & X86_PF_SGX))
		return 1;

	/*
	 * Make sure to check the VMA so that we do not perform
	 * faults just to hit a X86_PF_PK as soon as we fill in a
	 * page.
	 */
	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
				       (error_code & X86_PF_INSTR), foreign))
		return 1;

	/*
	 * Shadow stack accesses (PF_SHSTK=1) are only permitted to
	 * shadow stack VMAs.  All other accesses result in an error.
	 */
	if (error_code & X86_PF_SHSTK) {
		if (unlikely(!(vma->vm_flags & VM_SHADOW_STACK)))
			return 1;
		if (unlikely(!(vma->vm_flags & VM_WRITE)))
			return 1;
		return 0;
	}

	if (error_code & X86_PF_WRITE) {
		/* write, present and write, not present: */
		if (unlikely(vma->vm_flags & VM_SHADOW_STACK))
			return 1;
		if (unlikely(!(vma->vm_flags & VM_WRITE)))
			return 1;
		return 0;
	}

	/* read, present: */
	if (unlikely(error_code & X86_PF_PROT))
		return 1;

	/* read, not present: */
	if (unlikely(!vma_is_accessible(vma)))
		return 1;

	return 0;
}

bool fault_in_kernel_space(unsigned long address)
{
	/*
	 * On 64-bit systems, the vsyscall page is at an address above
	 * TASK_SIZE_MAX, but is not considered part of the kernel
	 * address space.
	 */
	if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
		return false;

	return address >= TASK_SIZE_MAX;
}

/*
 * Called for all faults where 'address' is part of the kernel address
 * space.  Might get called for faults that originate from *code* that
 * ran in userspace or the kernel.
 */
static void
do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
		   unsigned long address)
{
	/*
	 * Protection keys exceptions only happen on user pages.  We
	 * have no user pages in the kernel portion of the address
	 * space, so do not expect them here.
	 */
	WARN_ON_ONCE(hw_error_code & X86_PF_PK);

#ifdef CONFIG_X86_32
	/*
	 * We can fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * Before doing this on-demand faulting, ensure that the
	 * fault is not any of the following:
	 * 1. A fault on a PTE with a reserved bit set.
	 * 2. A fault caused by a user-mode access.  (Do not demand-
	 *    fault kernel memory due to user-mode accesses).
	 * 3. A fault caused by a page-level protection violation.
	 *    (A demand fault would be on a non-present page which
	 *     would have X86_PF_PROT==0).
	 *
	 * This is only needed to close a race condition on x86-32 in
	 * the vmalloc mapping/unmapping code. See the comment above
	 * vmalloc_fault() for details. On x86-64 the race does not
	 * exist as the vmalloc mappings don't need to be synchronized
	 * there.
	 */
	if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
		if (vmalloc_fault(address) >= 0)
			return;
	}
#endif

	if (is_f00f_bug(regs, hw_error_code, address))
		return;

	/* Was the fault spurious, caused by lazy TLB invalidation? */
	if (spurious_kernel_fault(hw_error_code, address))
		return;

	/* kprobes don't want to hook the spurious faults: */
	if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
		return;

	/*
	 * Note, despite being a "bad area", there are quite a few
	 * acceptable reasons to get here, such as erratum fixups
	 * and handling kernel code that can fault, like get_user().
	 *
	 * Don't take the mm semaphore here. If we fixup a prefetch
	 * fault we could otherwise deadlock:
	 */
	bad_area_nosemaphore(regs, hw_error_code, address);
}
NOKPROBE_SYMBOL(do_kern_addr_fault);

/*
 * Handle faults in the user portion of the address space.  Nothing in here
 * should check X86_PF_USER without a specific justification: for almost
 * all purposes, we should treat a normal kernel access to user memory
 * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction.
 * The one exception is AC flag handling, which is, per the x86
 * architecture, special for WRUSS.
 */
static inline
void do_user_addr_fault(struct pt_regs *regs,
			unsigned long error_code,
			unsigned long address)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct mm_struct *mm;
	vm_fault_t fault;
	unsigned int flags = FAULT_FLAG_DEFAULT;

	tsk = current;
	mm = tsk->mm;

	if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) {
		/*
		 * Whoops, this is kernel mode code trying to execute from
		 * user memory.  Unless this is AMD erratum #93, which
		 * corrupts RIP such that it looks like a user address,
		 * this is unrecoverable.  Don't even try to look up the
		 * VMA or look for extable entries.
		 */
		if (is_errata93(regs, address))
			return;

		page_fault_oops(regs, error_code, address);
		return;
	}

	/* kprobes don't want to hook the spurious faults: */
	if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
		return;

	/*
	 * Reserved bits are never expected to be set on
	 * entries in the user portion of the page tables.
	 */
	if (unlikely(error_code & X86_PF_RSVD))
		pgtable_bad(regs, error_code, address);

	/*
	 * If SMAP is on, check for invalid kernel (supervisor) access to user
	 * pages in the user address space.  The odd case here is WRUSS,
	 * which, according to the preliminary documentation, does not respect
	 * SMAP and will have the USER bit set so, in all cases, SMAP
	 * enforcement appears to be consistent with the USER bit.
	 */
	if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
		     !(error_code & X86_PF_USER) &&
		     !(regs->flags & X86_EFLAGS_AC))) {
		/*
		 * No extable entry here.  This was a kernel access to an
		 * invalid pointer.  get_kernel_nofault() will not get here.
		 */
		page_fault_oops(regs, error_code, address);
		return;
	}

	/*
	 * If we're in an interrupt, have no user context or are running
	 * in a region with pagefaults disabled then we must not take the fault
	 */
	if (unlikely(faulthandler_disabled() || !mm)) {
		bad_area_nosemaphore(regs, error_code, address);
		return;
	}

	/* Legacy check - remove this after verifying that it doesn't trigger */
	if (WARN_ON_ONCE(!(regs->flags & X86_EFLAGS_IF))) {
		bad_area_nosemaphore(regs, error_code, address);
		return;
	}

	local_irq_enable();

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

	/*
	 * Read-only permissions can not be expressed in shadow stack PTEs.
	 * Treat all shadow stack accesses as WRITE faults.  This ensures
	 * that the MM will prepare everything (e.g., break COW) such that
	 * maybe_mkwrite() can create a proper shadow stack PTE.
	 */
	if (error_code & X86_PF_SHSTK)
		flags |= FAULT_FLAG_WRITE;
	if (error_code & X86_PF_WRITE)
		flags |= FAULT_FLAG_WRITE;
	if (error_code & X86_PF_INSTR)
		flags |= FAULT_FLAG_INSTRUCTION;

	/*
	 * We set FAULT_FLAG_USER based on the register state, not
	 * based on X86_PF_USER.  User space accesses that cause
	 * system page faults are still user accesses.
	 */
	if (user_mode(regs))
		flags |= FAULT_FLAG_USER;

#ifdef CONFIG_X86_64
	/*
	 * Faults in the vsyscall page might need emulation.  The
	 * vsyscall page is at a high address (>PAGE_OFFSET), but is
	 * considered to be part of the user address space.
	 *
	 * The vsyscall page does not have a "real" VMA, so do this
	 * emulation before we go searching for VMAs.
	 *
	 * PKRU never rejects instruction fetches, so we don't need
	 * to consider the PF_PK bit.
	 */
	if (is_vsyscall_vaddr(address)) {
		if (emulate_vsyscall(error_code, regs, address))
			return;
	}
#endif

	if (!(flags & FAULT_FLAG_USER))
		goto lock_mmap;

	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		goto lock_mmap;

	if (unlikely(access_error(error_code, vma))) {
		bad_area_access_error(regs, error_code, address, NULL, vma);
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		return;
	}
	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);

	if (!(fault & VM_FAULT_RETRY)) {
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		goto done;
	}
	count_vm_vma_lock_event(VMA_LOCK_RETRY);
	if (fault & VM_FAULT_MAJOR)
		flags |= FAULT_FLAG_TRIED;

	/* Quick path to respond to signals */
	if (fault_signal_pending(fault, regs)) {
		if (!user_mode(regs))
			kernelmode_fixup_or_oops(regs, error_code, address,
						 SIGBUS, BUS_ADRERR,
						 ARCH_DEFAULT_PKEY);
		return;
	}
lock_mmap:

retry:
	vma = lock_mm_and_find_vma(mm, address, regs);
	if (unlikely(!vma)) {
		bad_area_nosemaphore(regs, error_code, address);
		return;
	}

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
	if (unlikely(access_error(error_code, vma))) {
		bad_area_access_error(regs, error_code, address, mm, vma);
		return;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.  Since we never set FAULT_FLAG_RETRY_NOWAIT, if
	 * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked.
	 *
	 * Note that handle_userfault() may also release and reacquire mmap_lock
	 * (and not return with VM_FAULT_RETRY), when returning to userland to
	 * repeat the page fault later with a VM_FAULT_NOPAGE retval
	 * (potentially after handling any pending signal during the return to
	 * userland). The return to userland is identified whenever
	 * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
	 */
	fault = handle_mm_fault(vma, address, flags, regs);

	if (fault_signal_pending(fault, regs)) {
		/*
		 * Quick path to respond to signals.  The core mm code
		 * has unlocked the mm for us if we get here.
		 */
		if (!user_mode(regs))
			kernelmode_fixup_or_oops(regs, error_code, address,
						 SIGBUS, BUS_ADRERR,
						 ARCH_DEFAULT_PKEY);
		return;
	}

	/* The fault is fully completed (including releasing mmap lock) */
	if (fault & VM_FAULT_COMPLETED)
		return;

	/*
	 * If we need to retry the mmap_lock has already been released,
	 * and if there is a fatal signal pending there is no guarantee
	 * that we made any progress. Handle this case first.
	 */
	if (unlikely(fault & VM_FAULT_RETRY)) {
		flags |= FAULT_FLAG_TRIED;
		goto retry;
	}

	mmap_read_unlock(mm);
done:
	if (likely(!(fault & VM_FAULT_ERROR)))
		return;

	if (fatal_signal_pending(current) && !user_mode(regs)) {
		kernelmode_fixup_or_oops(regs, error_code, address,
					 0, 0, ARCH_DEFAULT_PKEY);
		return;
	}

	if (fault & VM_FAULT_OOM) {
		/* Kernel mode? Handle exceptions or die: */
		if (!user_mode(regs)) {
			kernelmode_fixup_or_oops(regs, error_code, address,
						 SIGSEGV, SEGV_MAPERR,
						 ARCH_DEFAULT_PKEY);
			return;
		}

		/*
		 * We ran out of memory, call the OOM killer, and return to
		 * userspace (which will retry the fault, or kill us if we
		 * got oom-killed):
		 */
		pagefault_out_of_memory();
	} else {
		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
			     VM_FAULT_HWPOISON_LARGE))
			do_sigbus(regs, error_code, address, fault);
		else if (fault & VM_FAULT_SIGSEGV)
			bad_area_nosemaphore(regs, error_code, address);
		else
			BUG();
	}
}
NOKPROBE_SYMBOL(do_user_addr_fault);

static __always_inline void
trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
			 unsigned long address)
{
	if (!trace_pagefault_enabled())
		return;

	if (user_mode(regs))
		trace_page_fault_user(address, regs, error_code);
	else
		trace_page_fault_kernel(address, regs, error_code);
}
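
/*
 * Dispatch the fault to the kernel- or user-address handler once tracing
 * and mmiotrace have had a chance to claim it.
 */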
static __always_inline void
handle_page_fault(struct pt_regs *regs, unsigned long error_code,
		  unsigned long address)
{
	trace_page_fault_entries(regs, error_code, address);

	if (unlikely(kmmio_fault(regs, address)))
		return;

	/* Was the fault on kernel-controlled part of the address space? */
	if (unlikely(fault_in_kernel_space(address))) {
		do_kern_addr_fault(regs, error_code, address);
	} else {
		do_user_addr_fault(regs, error_code, address);
		/*
		 * User address page fault handling might have reenabled
		 * interrupts. Fixing up all potential exit points of
		 * do_user_addr_fault() and its leaf functions is just not
		 * doable w/o creating an unholy mess or turning the code
		 * upside down.
		 */
		local_irq_disable();
	}
}

DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
{
	irqentry_state_t state;
	unsigned long address;

	address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2();

	prefetchw(&current->mm->mmap_lock);

	/*
	 * KVM uses #PF vector to deliver 'page not present' events to guests
	 * (asynchronous page fault mechanism). The event happens when a
	 * userspace task is trying to access some valid (from guest's point of
	 * view) memory which is not currently mapped by the host (e.g. the
	 * memory is swapped out). Note, the corresponding "page ready" event
	 * which is injected when the memory becomes available, is delivered via
	 * an interrupt mechanism and not a #PF exception
	 * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()).
	 *
	 * We are relying on the interrupted context being sane (valid RSP,
	 * relevant locks not held, etc.), which is fine as long as the
	 * interrupted context had IF=1.  We are also relying on the KVM
	 * async pf type field and CR2 being read consistently instead of
	 * getting values from real and async page faults mixed up.
	 *
	 * Fingers crossed.
	 *
	 * The async #PF handling code takes care of idtentry handling
	 * itself.
	 */
	if (kvm_handle_async_pf(regs, (u32)address))
		return;

	/*
	 * Entry handling for valid #PF from kernel mode is slightly
	 * different: RCU is already watching and ct_irq_enter() must not
	 * be invoked because a kernel fault on a user space address might
	 * sleep.
	 *
	 * In case the fault hit a RCU idle region the conditional entry
	 * code reenabled RCU to avoid subsequent wreckage which helps
	 * debuggability.
	 */
	state = irqentry_enter(regs);

	instrumentation_begin();
	handle_page_fault(regs, error_code, address);
	instrumentation_end();

	irqentry_exit(regs, state);
}