fault.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/mm/fault.c
 *
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 1995-2004 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/acpi.h>
#include <linux/bitfield.h>
#include <linux/extable.h>
#include <linux/kfence.h>
#include <linux/signal.h>
#include <linux/mm.h>
#include <linux/hardirq.h>
#include <linux/init.h>
#include <linux/kasan.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/highmem.h>
#include <linux/perf_event.h>
#include <linux/pkeys.h>
#include <linux/preempt.h>
#include <linux/hugetlb.h>

#include <asm/acpi.h>
#include <asm/bug.h>
#include <asm/cmpxchg.h>
#include <asm/cpufeature.h>
#include <asm/efi.h>
#include <asm/exception.h>
#include <asm/daifflags.h>
#include <asm/debug-monitors.h>
#include <asm/esr.h>
#include <asm/kprobes.h>
#include <asm/mte.h>
#include <asm/processor.h>
#include <asm/sysreg.h>
#include <asm/system_misc.h>
#include <asm/tlbflush.h>
#include <asm/traps.h>

struct fault_info {
	int (*fn)(unsigned long far, unsigned long esr,
		  struct pt_regs *regs);
	int sig;
	int code;
	const char *name;
};

static const struct fault_info fault_info[];
static struct fault_info debug_fault_info[];

static inline const struct fault_info *esr_to_fault_info(unsigned long esr)
{
	return fault_info + (esr & ESR_ELx_FSC);
}

static inline const struct fault_info *esr_to_debug_fault_info(unsigned long esr)
{
	return debug_fault_info + DBG_ESR_EVT(esr);
}

static void data_abort_decode(unsigned long esr)
{
	unsigned long iss2 = ESR_ELx_ISS2(esr);

	pr_alert("Data abort info:\n");

	if (esr & ESR_ELx_ISV) {
		pr_alert(" Access size = %u byte(s)\n",
			 1U << ((esr & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT));
		pr_alert(" SSE = %lu, SRT = %lu\n",
			 (esr & ESR_ELx_SSE) >> ESR_ELx_SSE_SHIFT,
			 (esr & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT);
		pr_alert(" SF = %lu, AR = %lu\n",
			 (esr & ESR_ELx_SF) >> ESR_ELx_SF_SHIFT,
			 (esr & ESR_ELx_AR) >> ESR_ELx_AR_SHIFT);
	} else {
		pr_alert(" ISV = 0, ISS = 0x%08lx, ISS2 = 0x%08lx\n",
			 esr & ESR_ELx_ISS_MASK, iss2);
	}

	pr_alert(" CM = %lu, WnR = %lu, TnD = %lu, TagAccess = %lu\n",
		 (esr & ESR_ELx_CM) >> ESR_ELx_CM_SHIFT,
		 (esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT,
		 (iss2 & ESR_ELx_TnD) >> ESR_ELx_TnD_SHIFT,
		 (iss2 & ESR_ELx_TagAccess) >> ESR_ELx_TagAccess_SHIFT);

	pr_alert(" GCS = %ld, Overlay = %lu, DirtyBit = %lu, Xs = %llu\n",
		 (iss2 & ESR_ELx_GCS) >> ESR_ELx_GCS_SHIFT,
		 (iss2 & ESR_ELx_Overlay) >> ESR_ELx_Overlay_SHIFT,
		 (iss2 & ESR_ELx_DirtyBit) >> ESR_ELx_DirtyBit_SHIFT,
		 (iss2 & ESR_ELx_Xs_MASK) >> ESR_ELx_Xs_SHIFT);
}

static void mem_abort_decode(unsigned long esr)
{
	pr_alert("Mem abort info:\n");

	pr_alert(" ESR = 0x%016lx\n", esr);
	pr_alert(" EC = 0x%02lx: %s, IL = %u bits\n",
		 ESR_ELx_EC(esr), esr_get_class_string(esr),
		 (esr & ESR_ELx_IL) ? 32 : 16);
	pr_alert(" SET = %lu, FnV = %lu\n",
		 (esr & ESR_ELx_SET_MASK) >> ESR_ELx_SET_SHIFT,
		 (esr & ESR_ELx_FnV) >> ESR_ELx_FnV_SHIFT);
	pr_alert(" EA = %lu, S1PTW = %lu\n",
		 (esr & ESR_ELx_EA) >> ESR_ELx_EA_SHIFT,
		 (esr & ESR_ELx_S1PTW) >> ESR_ELx_S1PTW_SHIFT);
	pr_alert(" FSC = 0x%02lx: %s\n", (esr & ESR_ELx_FSC),
		 esr_to_fault_info(esr)->name);

	if (esr_is_data_abort(esr))
		data_abort_decode(esr);
}

static inline unsigned long mm_to_pgd_phys(struct mm_struct *mm)
{
	/* Either init_pg_dir or swapper_pg_dir */
	if (mm == &init_mm)
		return __pa_symbol(mm->pgd);

	return (unsigned long)virt_to_phys(mm->pgd);
}

/*
 * Dump out the page tables associated with 'addr' in the currently active mm.
 */
static void show_pte(unsigned long addr)
{
	struct mm_struct *mm;
	pgd_t *pgdp;
	pgd_t pgd;

	if (is_ttbr0_addr(addr)) {
		/* TTBR0 */
		mm = current->active_mm;
		if (mm == &init_mm) {
			pr_alert("[%016lx] user address but active_mm is swapper\n",
				 addr);
			return;
		}
	} else if (is_ttbr1_addr(addr)) {
		/* TTBR1 */
		mm = &init_mm;
	} else {
		pr_alert("[%016lx] address between user and kernel address ranges\n",
			 addr);
		return;
	}

	pr_alert("%s pgtable: %luk pages, %llu-bit VAs, pgdp=%016lx\n",
		 mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K,
		 vabits_actual, mm_to_pgd_phys(mm));
	pgdp = pgd_offset(mm, addr);
	pgd = READ_ONCE(*pgdp);
	pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd));

	do {
		p4d_t *p4dp, p4d;
		pud_t *pudp, pud;
		pmd_t *pmdp, pmd;
		pte_t *ptep, pte;

		if (pgd_none(pgd) || pgd_bad(pgd))
			break;

		p4dp = p4d_offset(pgdp, addr);
		p4d = READ_ONCE(*p4dp);
		pr_cont(", p4d=%016llx", p4d_val(p4d));
		if (p4d_none(p4d) || p4d_bad(p4d))
			break;

		pudp = pud_offset(p4dp, addr);
		pud = READ_ONCE(*pudp);
		pr_cont(", pud=%016llx", pud_val(pud));
		if (pud_none(pud) || pud_bad(pud))
			break;

		pmdp = pmd_offset(pudp, addr);
		pmd = READ_ONCE(*pmdp);
		pr_cont(", pmd=%016llx", pmd_val(pmd));
		if (pmd_none(pmd) || pmd_bad(pmd))
			break;

		ptep = pte_offset_map(pmdp, addr);
		if (!ptep)
			break;

		pte = __ptep_get(ptep);
		pr_cont(", pte=%016llx", pte_val(pte));
		pte_unmap(ptep);
	} while(0);

	pr_cont("\n");
}

/*
 * This function sets the access flags (dirty, accessed), as well as write
 * permission, and only to a more permissive setting.
 *
 * It needs to cope with hardware update of the accessed/dirty state by other
 * agents in the system and can safely skip the __sync_icache_dcache() call as,
 * like __set_ptes(), the PTE is never changed from no-exec to exec here.
 *
 * Returns whether or not the PTE actually changed.
 */
int __ptep_set_access_flags(struct vm_area_struct *vma,
			    unsigned long address, pte_t *ptep,
			    pte_t entry, int dirty)
{
	pteval_t old_pteval, pteval;
	pte_t pte = __ptep_get(ptep);

	if (pte_same(pte, entry))
		return 0;

	/* only preserve the access flags and write permission */
	pte_val(entry) &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY;

	/*
	 * Setting the flags must be done atomically to avoid racing with the
	 * hardware update of the access/dirty state. The PTE_RDONLY bit must
	 * be set to the most permissive (lowest value) of *ptep and entry
	 * (calculated as: a & b == ~(~a | ~b)).
	 */
	pte_val(entry) ^= PTE_RDONLY;
	pteval = pte_val(pte);
	do {
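		/*
		 * The XOR/OR/XOR sequence below ANDs the PTE_RDONLY bits of
		 * *ptep and entry (keeping the more permissive value, as per
		 * the comment above) while ORing in all other requested flag
		 * bits; cmpxchg retries if hardware changed the PTE under us.
		 */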
		old_pteval = pteval;
		pteval ^= PTE_RDONLY;
		pteval |= pte_val(entry);
		pteval ^= PTE_RDONLY;
		pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
	} while (pteval != old_pteval);

	/* Invalidate a stale read-only entry */
	if (dirty)
		flush_tlb_page(vma, address);
	return 1;
}

static bool is_el1_instruction_abort(unsigned long esr)
{
	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR;
}

static bool is_el1_data_abort(unsigned long esr)
{
	return ESR_ELx_EC(esr) == ESR_ELx_EC_DABT_CUR;
}

static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr,
					   struct pt_regs *regs)
{
	if (!is_el1_data_abort(esr) && !is_el1_instruction_abort(esr))
		return false;

	if (esr_fsc_is_permission_fault(esr))
		return true;

	if (is_ttbr0_addr(addr) && system_uses_ttbr0_pan())
		return esr_fsc_is_translation_fault(esr) &&
			(regs->pstate & PSR_PAN_BIT);

	return false;
}

static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
							unsigned long esr,
							struct pt_regs *regs)
{
	unsigned long flags;
	u64 par, dfsc;

	if (!is_el1_data_abort(esr) || !esr_fsc_is_translation_fault(esr))
		return false;

	local_irq_save(flags);
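	/*
	 * Re-walk the faulting address with an AT S1E1R translation and read
	 * the result back from PAR_EL1, with IRQs masked so that PAR_EL1 is
	 * not clobbered in between.
	 */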
	asm volatile("at s1e1r, %0" :: "r" (addr));
	isb();
	par = read_sysreg_par();
	local_irq_restore(flags);

	/*
	 * If we now have a valid translation, treat the translation fault as
	 * spurious.
	 */
	if (!(par & SYS_PAR_EL1_F))
		return true;

	/*
	 * If we got a different type of fault from the AT instruction,
	 * treat the translation fault as spurious.
	 */
	dfsc = FIELD_GET(SYS_PAR_EL1_FST, par);
	return !esr_fsc_is_translation_fault(dfsc);
}

static void die_kernel_fault(const char *msg, unsigned long addr,
			     unsigned long esr, struct pt_regs *regs)
{
	bust_spinlocks(1);

	pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
		 addr);

	kasan_non_canonical_hook(addr);

	mem_abort_decode(esr);

	show_pte(addr);
	die("Oops", regs, esr);
	bust_spinlocks(0);
	make_task_dead(SIGKILL);
}

#ifdef CONFIG_KASAN_HW_TAGS
static void report_tag_fault(unsigned long addr, unsigned long esr,
			     struct pt_regs *regs)
{
	/*
	 * SAS bits aren't set for all faults reported in EL1, so we can't
	 * find out access size.
	 */
	bool is_write = !!(esr & ESR_ELx_WNR);

	kasan_report((void *)addr, 0, is_write, regs->pc);
}
#else
/* Tag faults aren't enabled without CONFIG_KASAN_HW_TAGS. */
static inline void report_tag_fault(unsigned long addr, unsigned long esr,
				    struct pt_regs *regs) { }
#endif

static void do_tag_recovery(unsigned long addr, unsigned long esr,
			    struct pt_regs *regs)
{
	report_tag_fault(addr, esr, regs);

	/*
	 * Disable MTE Tag Checking on the local CPU for the current EL.
	 * It will be done lazily on the other CPUs when they will hit a
	 * tag fault.
	 */
	sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF_MASK,
			 SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF, NONE));
	isb();
}

static bool is_el1_mte_sync_tag_check_fault(unsigned long esr)
{
	unsigned long fsc = esr & ESR_ELx_FSC;

	if (!is_el1_data_abort(esr))
		return false;

	if (fsc == ESR_ELx_FSC_MTE)
		return true;

	return false;
}

static void __do_kernel_fault(unsigned long addr, unsigned long esr,
			      struct pt_regs *regs)
{
	const char *msg;

	/*
	 * Are we prepared to handle this kernel fault?
	 * We are almost certainly not prepared to handle instruction faults.
	 */
	if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
		return;

	if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs),
	    "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr))
		return;

	if (is_el1_mte_sync_tag_check_fault(esr)) {
		do_tag_recovery(addr, esr, regs);

		return;
	}

	if (is_el1_permission_fault(addr, esr, regs)) {
		if (esr & ESR_ELx_WNR)
			msg = "write to read-only memory";
		else if (is_el1_instruction_abort(esr))
			msg = "execute from non-executable memory";
		else
			msg = "read from unreadable memory";
	} else if (addr < PAGE_SIZE) {
		msg = "NULL pointer dereference";
	} else {
		if (esr_fsc_is_translation_fault(esr) &&
		    kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
			return;

		msg = "paging request";
	}

	if (efi_runtime_fixup_exception(regs, msg))
		return;

	die_kernel_fault(msg, addr, esr, regs);
}

static void set_thread_esr(unsigned long address, unsigned long esr)
{
	current->thread.fault_address = address;

	/*
	 * If the faulting address is in the kernel, we must sanitize the ESR.
	 * From userspace's point of view, kernel-only mappings don't exist
	 * at all, so we report them as level 0 translation faults.
	 * (This is not quite the way that "no mapping there at all" behaves:
	 * an alignment fault not caused by the memory type would take
	 * precedence over translation fault for a real access to empty
	 * space. Unfortunately we can't easily distinguish "alignment fault
	 * not caused by memory type" from "alignment fault caused by memory
	 * type", so we ignore this wrinkle and just return the translation
	 * fault.)
	 */
	if (!is_ttbr0_addr(current->thread.fault_address)) {
		switch (ESR_ELx_EC(esr)) {
		case ESR_ELx_EC_DABT_LOW:
			/*
			 * These bits provide only information about the
			 * faulting instruction, which userspace knows already.
			 * We explicitly clear bits which are architecturally
			 * RES0 in case they are given meanings in future.
			 * We always report the ESR as if the fault was taken
			 * to EL1 and so ISV and the bits in ISS[23:14] are
			 * clear. (In fact it always will be a fault to EL1.)
			 */
			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL |
				ESR_ELx_CM | ESR_ELx_WNR;
			esr |= ESR_ELx_FSC_FAULT;
			break;
		case ESR_ELx_EC_IABT_LOW:
			/*
			 * Claim a level 0 translation fault.
			 * All other bits are architecturally RES0 for faults
			 * reported with that DFSC value, so we clear them.
			 */
			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL;
			esr |= ESR_ELx_FSC_FAULT;
			break;
		default:
			/*
			 * This should never happen (entry.S only brings us
			 * into this code for insn and data aborts from a lower
			 * exception level). Fail safe by not providing an ESR
			 * context record at all.
			 */
			WARN(1, "ESR 0x%lx is not DABT or IABT from EL0\n", esr);
			esr = 0;
			break;
		}
	}

	current->thread.fault_code = esr;
}

static void do_bad_area(unsigned long far, unsigned long esr,
			struct pt_regs *regs)
{
	unsigned long addr = untagged_addr(far);

	/*
	 * If we are in kernel mode at this point, we have no context to
	 * handle this fault with.
	 */
	if (user_mode(regs)) {
		const struct fault_info *inf = esr_to_fault_info(esr);

		set_thread_esr(addr, esr);
		arm64_force_sig_fault(inf->sig, inf->code, far, inf->name);
	} else {
		__do_kernel_fault(addr, esr, regs);
	}
}
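/*
 * Did this fault come from a Permission Overlay (POE) protection key?
 * Either the ESR reports an overlay permission fault directly, or the
 * vma's protection key does not permit this kind of access.
 */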
static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
			unsigned int mm_flags)
{
	unsigned long iss2 = ESR_ELx_ISS2(esr);

	if (!system_supports_poe())
		return false;

	if (esr_fsc_is_permission_fault(esr) && (iss2 & ESR_ELx_Overlay))
		return true;

	return !arch_vma_access_permitted(vma,
			mm_flags & FAULT_FLAG_WRITE,
			mm_flags & FAULT_FLAG_INSTRUCTION,
			false);
}

static bool is_el0_instruction_abort(unsigned long esr)
{
	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
}

/*
 * Note: not valid for EL1 DC IVAC, but we never use that such that it
 * should fault. EL0 cannot issue DC IVAC (undef).
 */
static bool is_write_abort(unsigned long esr)
{
	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
}
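/*
 * Handle a fault that may be resolved by the mm: work out which vm_flags
 * the access requires, then try to service it under the per-VMA lock
 * before falling back to the mmap_lock path.
 */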
static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
				   struct pt_regs *regs)
{
	const struct fault_info *inf;
	struct mm_struct *mm = current->mm;
	vm_fault_t fault;
	unsigned long vm_flags;
	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
	unsigned long addr = untagged_addr(far);
	struct vm_area_struct *vma;
	int si_code;
	int pkey = -1;

	if (kprobe_page_fault(regs, esr))
		return 0;

	/*
	 * If we're in an interrupt or have no user context, we must not take
	 * the fault.
	 */
	if (faulthandler_disabled() || !mm)
		goto no_context;

	if (user_mode(regs))
		mm_flags |= FAULT_FLAG_USER;

	/*
	 * vm_flags tells us what bits we must have in vma->vm_flags
	 * for the fault to be benign, __do_page_fault() would check
	 * vma->vm_flags & vm_flags and returns an error if the
	 * intersection is empty
	 */
	if (is_el0_instruction_abort(esr)) {
		/* It was exec fault */
		vm_flags = VM_EXEC;
		mm_flags |= FAULT_FLAG_INSTRUCTION;
	} else if (is_write_abort(esr)) {
		/* It was write fault */
		vm_flags = VM_WRITE;
		mm_flags |= FAULT_FLAG_WRITE;
	} else {
		/* It was read fault */
		vm_flags = VM_READ;
		/* Write implies read */
		vm_flags |= VM_WRITE;
		/* If EPAN is absent then exec implies read */
		if (!alternative_has_cap_unlikely(ARM64_HAS_EPAN))
			vm_flags |= VM_EXEC;
	}

	if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) {
		if (is_el1_instruction_abort(esr))
			die_kernel_fault("execution of user memory",
					 addr, esr, regs);

		if (!search_exception_tables(regs->pc))
			die_kernel_fault("access to user memory outside uaccess routines",
					 addr, esr, regs);
	}

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);

	if (!(mm_flags & FAULT_FLAG_USER))
		goto lock_mmap;
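	/*
	 * First try to handle the fault under the per-VMA lock, without
	 * taking the mmap_lock. Fall back to the lock_mmap path below if
	 * the VMA cannot be locked or the fault needs to be retried.
	 */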
	vma = lock_vma_under_rcu(mm, addr);
	if (!vma)
		goto lock_mmap;

	if (!(vma->vm_flags & vm_flags)) {
		vma_end_read(vma);
		fault = 0;
		si_code = SEGV_ACCERR;
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		goto bad_area;
	}

	if (fault_from_pkey(esr, vma, mm_flags)) {
		pkey = vma_pkey(vma);
		vma_end_read(vma);
		fault = 0;
		si_code = SEGV_PKUERR;
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		goto bad_area;
	}

	fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);

	if (!(fault & VM_FAULT_RETRY)) {
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		goto done;
	}
	count_vm_vma_lock_event(VMA_LOCK_RETRY);
	if (fault & VM_FAULT_MAJOR)
		mm_flags |= FAULT_FLAG_TRIED;

	/* Quick path to respond to signals */
	if (fault_signal_pending(fault, regs)) {
		if (!user_mode(regs))
			goto no_context;
		return 0;
	}
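	/*
	 * Slow path: take the mmap_lock and walk the VMA tree. Reached when
	 * the per-VMA-lock fast path could not complete the fault, or when
	 * the fault did not originate from user context.
	 */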
lock_mmap:
retry:
	vma = lock_mm_and_find_vma(mm, addr, regs);
	if (unlikely(!vma)) {
		fault = 0;
		si_code = SEGV_MAPERR;
		goto bad_area;
	}

	if (!(vma->vm_flags & vm_flags)) {
		mmap_read_unlock(mm);
		fault = 0;
		si_code = SEGV_ACCERR;
		goto bad_area;
	}

	if (fault_from_pkey(esr, vma, mm_flags)) {
		pkey = vma_pkey(vma);
		mmap_read_unlock(mm);
		fault = 0;
		si_code = SEGV_PKUERR;
		goto bad_area;
	}

	fault = handle_mm_fault(vma, addr, mm_flags, regs);

	/* Quick path to respond to signals */
	if (fault_signal_pending(fault, regs)) {
		if (!user_mode(regs))
			goto no_context;
		return 0;
	}

	/* The fault is fully completed (including releasing mmap lock) */
	if (fault & VM_FAULT_COMPLETED)
		return 0;

	if (fault & VM_FAULT_RETRY) {
		mm_flags |= FAULT_FLAG_TRIED;
		goto retry;
	}
	mmap_read_unlock(mm);

done:
	/* Handle the "normal" (no error) case first. */
	if (likely(!(fault & VM_FAULT_ERROR)))
		return 0;

	si_code = SEGV_MAPERR;
bad_area:
	/*
	 * If we are in kernel mode at this point, we have no context to
	 * handle this fault with.
	 */
	if (!user_mode(regs))
		goto no_context;

	if (fault & VM_FAULT_OOM) {
		/*
		 * We ran out of memory, call the OOM killer, and return to
		 * userspace (which will retry the fault, or kill us if we got
		 * oom-killed).
		 */
		pagefault_out_of_memory();
		return 0;
	}

	inf = esr_to_fault_info(esr);
	set_thread_esr(addr, esr);
	if (fault & VM_FAULT_SIGBUS) {
		/*
		 * We had some memory, but were unable to successfully fix up
		 * this page fault.
		 */
		arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name);
	} else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) {
		unsigned int lsb;

		lsb = PAGE_SHIFT;
		if (fault & VM_FAULT_HWPOISON_LARGE)
			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));

		arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name);
	} else {
		/*
		 * The pkey value that we return to userspace can be different
		 * from the pkey that caused the fault.
		 *
		 * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
		 * 2. T1 : set POR_EL0 to deny access to pkey=4, touches page
		 * 3. T1 : faults...
		 * 4. T2 : mprotect_key(foo, PAGE_SIZE, pkey=5);
		 * 5. T1 : enters fault handler, takes mmap_lock, etc...
		 * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
		 *    faulted on a pte with its pkey=4.
		 */
		/* Something tried to access memory that is not in our memory map */
		if (si_code == SEGV_PKUERR)
			arm64_force_sig_fault_pkey(far, inf->name, pkey);
		else
			arm64_force_sig_fault(SIGSEGV, si_code, far, inf->name);
	}

	return 0;

no_context:
	__do_kernel_fault(addr, esr, regs);
	return 0;
}

static int __kprobes do_translation_fault(unsigned long far,
					  unsigned long esr,
					  struct pt_regs *regs)
{
	unsigned long addr = untagged_addr(far);

	if (is_ttbr0_addr(addr))
		return do_page_fault(far, esr, regs);

	do_bad_area(far, esr, regs);
	return 0;
}

static int do_alignment_fault(unsigned long far, unsigned long esr,
			      struct pt_regs *regs)
{
	if (IS_ENABLED(CONFIG_COMPAT_ALIGNMENT_FIXUPS) &&
	    compat_user_mode(regs))
		return do_compat_alignment_fixup(far, regs);
	do_bad_area(far, esr, regs);
	return 0;
}

static int do_bad(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
	return 1; /* "fault" */
}

static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
	const struct fault_info *inf;
	unsigned long siaddr;

	inf = esr_to_fault_info(esr);

	if (user_mode(regs) && apei_claim_sea(regs) == 0) {
		/*
		 * APEI claimed this as a firmware-first notification.
		 * Some processing deferred to task_work before ret_to_user().
		 */
		return 0;
	}

	if (esr & ESR_ELx_FnV) {
		siaddr = 0;
	} else {
		/*
		 * The architecture specifies that the tag bits of FAR_EL1 are
		 * UNKNOWN for synchronous external aborts. Mask them out now
		 * so that userspace doesn't see them.
		 */
		siaddr = untagged_addr(far);
	}
	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);

	return 0;
}

static int do_tag_check_fault(unsigned long far, unsigned long esr,
			      struct pt_regs *regs)
{
	/*
	 * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN
	 * for tag check faults. Set them to corresponding bits in the untagged
	 * address.
	 */
	far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);
	do_bad_area(far, esr, regs);
	return 0;
}
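/*
 * Fault handlers, indexed by the FSC field of ESR_ELx (see
 * esr_to_fault_info() above).
 */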
static const struct fault_info fault_info[] = {
	{ do_bad, SIGKILL, SI_KERNEL, "ttbr address size fault" },
	{ do_bad, SIGKILL, SI_KERNEL, "level 1 address size fault" },
	{ do_bad, SIGKILL, SI_KERNEL, "level 2 address size fault" },
	{ do_bad, SIGKILL, SI_KERNEL, "level 3 address size fault" },
	{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 0 translation fault" },
	{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" },
	{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" },
	{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" },
	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 0 access flag fault" },
	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" },
	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" },
	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 access flag fault" },
	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 0 permission fault" },
	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 permission fault" },
	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" },
	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" },
	{ do_sea, SIGBUS, BUS_OBJERR, "synchronous external abort" },
	{ do_tag_check_fault, SIGSEGV, SEGV_MTESERR, "synchronous tag check fault" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 18" },
	{ do_sea, SIGKILL, SI_KERNEL, "level -1 (translation table walk)" },
	{ do_sea, SIGKILL, SI_KERNEL, "level 0 (translation table walk)" },
	{ do_sea, SIGKILL, SI_KERNEL, "level 1 (translation table walk)" },
	{ do_sea, SIGKILL, SI_KERNEL, "level 2 (translation table walk)" },
	{ do_sea, SIGKILL, SI_KERNEL, "level 3 (translation table walk)" },
	{ do_sea, SIGBUS, BUS_OBJERR, "synchronous parity or ECC error" },	// Reserved when RAS is implemented
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 25" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 26" },
	{ do_sea, SIGKILL, SI_KERNEL, "level -1 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_sea, SIGKILL, SI_KERNEL, "level 0 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_sea, SIGKILL, SI_KERNEL, "level 1 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_sea, SIGKILL, SI_KERNEL, "level 2 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_sea, SIGKILL, SI_KERNEL, "level 3 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 32" },
	{ do_alignment_fault, SIGBUS, BUS_ADRALN, "alignment fault" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 34" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 35" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 36" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 37" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 38" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 39" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 40" },
	{ do_bad, SIGKILL, SI_KERNEL, "level -1 address size fault" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 42" },
	{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level -1 translation fault" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 44" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 45" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 46" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 47" },
	{ do_bad, SIGKILL, SI_KERNEL, "TLB conflict abort" },
	{ do_bad, SIGKILL, SI_KERNEL, "Unsupported atomic hardware update fault" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 50" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 51" },
	{ do_bad, SIGKILL, SI_KERNEL, "implementation fault (lockdown abort)" },
	{ do_bad, SIGBUS, BUS_OBJERR, "implementation fault (unsupported exclusive)" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 54" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 55" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 56" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 57" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 58" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 59" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 60" },
	{ do_bad, SIGKILL, SI_KERNEL, "section domain fault" },
	{ do_bad, SIGKILL, SI_KERNEL, "page domain fault" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 63" },
};

void do_mem_abort(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
	const struct fault_info *inf = esr_to_fault_info(esr);
	unsigned long addr = untagged_addr(far);

	if (!inf->fn(far, esr, regs))
		return;

	if (!user_mode(regs))
		die_kernel_fault(inf->name, addr, esr, regs);

	/*
	 * At this point we have an unrecognized fault type whose tag bits may
	 * have been defined as UNKNOWN. Therefore we only expose the untagged
	 * address to the signal handler.
	 */
	arm64_notify_die(inf->name, regs, inf->sig, inf->code, addr, esr);
}
NOKPROBE_SYMBOL(do_mem_abort);

void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs)
{
	arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN,
			 addr, esr);
}
NOKPROBE_SYMBOL(do_sp_pc_abort);

/*
 * __refdata because early_brk64 is __init, but the reference to it is
 * clobbered at arch_initcall time.
 * See traps.c and debug-monitors.c:debug_traps_init().
 */
static struct fault_info __refdata debug_fault_info[] = {
	{ do_bad, SIGTRAP, TRAP_HWBKPT, "hardware breakpoint" },
	{ do_bad, SIGTRAP, TRAP_HWBKPT, "hardware single-step" },
	{ do_bad, SIGTRAP, TRAP_HWBKPT, "hardware watchpoint" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 3" },
	{ do_bad, SIGTRAP, TRAP_BRKPT, "aarch32 BKPT" },
	{ do_bad, SIGKILL, SI_KERNEL, "aarch32 vector catch" },
	{ early_brk64, SIGTRAP, TRAP_BRKPT, "aarch64 BRK" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 7" },
};

void __init hook_debug_fault_code(int nr,
				  int (*fn)(unsigned long, unsigned long, struct pt_regs *),
				  int sig, int code, const char *name)
{
	BUG_ON(nr < 0 || nr >= ARRAY_SIZE(debug_fault_info));

	debug_fault_info[nr].fn = fn;
	debug_fault_info[nr].sig = sig;
	debug_fault_info[nr].code = code;
	debug_fault_info[nr].name = name;
}

/*
 * In debug exception context, we explicitly disable preemption despite
 * having interrupts disabled.
 * This serves two purposes: it makes it much less likely that we would
 * accidentally schedule in exception context and it will force a warning
 * if we somehow manage to schedule by accident.
 */
static void debug_exception_enter(struct pt_regs *regs)
{
	preempt_disable();

	/* This code is a bit fragile. Test it. */
	RCU_LOCKDEP_WARN(!rcu_is_watching(), "exception_enter didn't work");
}
NOKPROBE_SYMBOL(debug_exception_enter);

static void debug_exception_exit(struct pt_regs *regs)
{
	preempt_enable_no_resched();
}
NOKPROBE_SYMBOL(debug_exception_exit);

void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr,
			struct pt_regs *regs)
{
	const struct fault_info *inf = esr_to_debug_fault_info(esr);
	unsigned long pc = instruction_pointer(regs);

	debug_exception_enter(regs);

	if (user_mode(regs) && !is_ttbr0_addr(pc))
		arm64_apply_bp_hardening();

	if (inf->fn(addr_if_watchpoint, esr, regs)) {
		arm64_notify_die(inf->name, regs, inf->sig, inf->code, pc, esr);
	}

	debug_exception_exit(regs);
}
NOKPROBE_SYMBOL(do_debug_exception);

/*
 * Used during anonymous page fault handling.
 */
struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
					     unsigned long vaddr)
{
	gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO;

	/*
	 * If the page is mapped with PROT_MTE, initialise the tags at the
	 * point of allocation and page zeroing as this is usually faster than
	 * separate DC ZVA and STGM.
	 */
	if (vma->vm_flags & VM_MTE)
		flags |= __GFP_ZEROTAGS;

	return vma_alloc_folio(flags, 0, vma, vaddr, false);
}

void tag_clear_highpage(struct page *page)
{
	/* Newly allocated page, shouldn't have been tagged yet */
	WARN_ON_ONCE(!try_page_mte_tagging(page));
	mte_zero_clear_page_tags(page_address(page));
	set_page_mte_tagged(page);
}