// SPDX-License-Identifier: GPL-2.0-only
#include <linux/extable.h>
#include <linux/uaccess.h>
#include <linux/sched/debug.h>
#include <linux/bitfield.h>
#include <xen/xen.h>

#include <asm/fpu/api.h>
#include <asm/fred.h>
#include <asm/sev.h>
#include <asm/traps.h>
#include <asm/kdebug.h>
#include <asm/insn-eval.h>
#include <asm/sgx.h>
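
/*
 * Resolve a register number encoded in an exception table entry to the
 * corresponding slot in struct pt_regs. A bogus register number is a
 * kernel bug; warn once and hand back a dummy slot so a broken fixup
 * cannot scribble over unrelated memory.
 */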
static inline unsigned long *pt_regs_nr(struct pt_regs *regs, int nr)
{
        int reg_offset = pt_regs_offset(regs, nr);
        static unsigned long __dummy;

        if (WARN_ON_ONCE(reg_offset < 0))
                return &__dummy;

        return (unsigned long *)((unsigned long)regs + reg_offset);
}
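
/*
 * Fixup targets are stored as 32-bit offsets relative to the entry's
 * own 'fixup' field, which keeps the exception table small and
 * position independent; this helper turns the offset back into an
 * absolute address.
 */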
static inline unsigned long
ex_fixup_addr(const struct exception_table_entry *x)
{
        return (unsigned long)&x->fixup + x->fixup;
}
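
/*
 * The default action: optionally clear %rax and/or %rdx (as requested
 * by flags in the entry's data word) and resume execution at the fixup
 * address. All handlers below funnel into this and return true to tell
 * the trap code that the fault has been handled.
 */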
static bool ex_handler_default(const struct exception_table_entry *e,
                               struct pt_regs *regs)
{
        if (e->data & EX_FLAG_CLEAR_AX)
                regs->ax = 0;
        if (e->data & EX_FLAG_CLEAR_DX)
                regs->dx = 0;

        regs->ip = ex_fixup_addr(e);
        return true;
}

/*
 * This is the *very* rare case where we do a "load_unaligned_zeropad()"
 * and it's a page crosser into a non-existent page.
 *
 * This happens when we optimistically load a pathname a word-at-a-time
 * and the name is less than the full word and the next page is not
 * mapped. Typically that only happens for CONFIG_DEBUG_PAGEALLOC.
 *
 * NOTE! The faulting address is always a 'mov mem,reg' type instruction
 * of size 'long', and the exception fixup must always point to right
 * after the instruction.
 */
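/*
 * Worked example (the concrete addresses are illustrative): an 8-byte
 * load at 0x...0ff9 (offset 1 within the aligned word at 0x...0ff8)
 * crosses into an unmapped page at 0x...1000 and faults. The fixup
 * below re-reads the aligned word at 0x...0ff8, which lies entirely
 * within the mapped page, then shifts it right by offset * 8 = 8 bits,
 * so the destination register ends up holding the seven valid bytes
 * with the high byte zero-filled - exactly what
 * load_unaligned_zeropad() promises.
 */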
static bool ex_handler_zeropad(const struct exception_table_entry *e,
                               struct pt_regs *regs,
                               unsigned long fault_addr)
{
        struct insn insn;
        const unsigned long mask = sizeof(long) - 1;
        unsigned long offset, addr, next_ip, len;
        unsigned long *reg;

        next_ip = ex_fixup_addr(e);
        len = next_ip - regs->ip;
        if (len > MAX_INSN_SIZE)
                return false;

        if (insn_decode(&insn, (void *) regs->ip, len, INSN_MODE_KERN))
                return false;
        if (insn.length != len)
                return false;

        /* Only handle a full-width 'mov mem,reg' (opcode 0x8b) */
        if (insn.opcode.bytes[0] != 0x8b)
                return false;
        if (insn.opnd_bytes != sizeof(long))
                return false;

        addr = (unsigned long) insn_get_addr_ref(&insn, regs);
        if (addr == ~0ul)
                return false;

        offset = addr & mask;
        addr = addr & ~mask;

        /* The fault must be exactly one word past the aligned load */
        if (fault_addr != addr + sizeof(long))
                return false;

        reg = insn_get_modrm_reg_ptr(&insn, regs);
        if (!reg)
                return false;

        *reg = *(unsigned long *)addr >> (offset * 8);
        return ex_handler_default(e, regs);
}

static bool ex_handler_fault(const struct exception_table_entry *fixup,
                             struct pt_regs *regs, int trapnr)
{
        regs->ax = trapnr;
        return ex_handler_default(fixup, regs);
}
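
/*
 * ENCLS faults land here: report the trap number in %rax with
 * SGX_ENCLS_FAULT_FLAG set, so callers can tell a hardware fault apart
 * from an ordinary ENCLS error code.
 */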
static bool ex_handler_sgx(const struct exception_table_entry *fixup,
                           struct pt_regs *regs, int trapnr)
{
        regs->ax = trapnr | SGX_ENCLS_FAULT_FLAG;
        return ex_handler_default(fixup, regs);
}

/*
 * Handler for when we fail to restore a task's FPU state. We should never get
 * here because the FPU state of a task using the FPU (task->thread.fpu.state)
 * should always be valid. However, past bugs have allowed userspace to set
 * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn().
 * These caused XRSTOR to fail when switching to the task, leaking the FPU
 * registers of the task previously executing on the CPU. Mitigate this class
 * of vulnerability by restoring from the initial state (essentially, zeroing
 * out all the FPU registers) if we can't restore from the task's FPU state.
 */
static bool ex_handler_fprestore(const struct exception_table_entry *fixup,
                                 struct pt_regs *regs)
{
        regs->ip = ex_fixup_addr(fixup);

        WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
                  (void *)instruction_pointer(regs));

        fpu_reset_from_exception_fixup();
        return true;
}

/*
 * On x86-64, we end up being imprecise with 'access_ok()', and allow
 * non-canonical user addresses to make the range comparisons simpler,
 * and to not have to worry about LAM being enabled.
 *
 * In fact, we allow up to one page of "slop" at the sign boundary,
 * which means that we can do access_ok() by just checking the sign
 * of the pointer for the common case of having a small access size.
 */
static bool gp_fault_address_ok(unsigned long fault_address)
{
#ifdef CONFIG_X86_64
        /* Is it in the "user space" part of the non-canonical space? */
        if (valid_user_address(fault_address))
                return true;

        /* .. or just above it? */
        fault_address -= PAGE_SIZE;
        if (valid_user_address(fault_address))
                return true;
#endif
        return false;
}
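
/*
 * A user-access fixup that additionally screams if the fault was a #GP,
 * since a general protection fault on a user access almost always means
 * the kernel dereferenced a non-canonical (i.e. corrupted) pointer
 * rather than an unmapped but plausible user address.
 */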
static bool ex_handler_uaccess(const struct exception_table_entry *fixup,
                               struct pt_regs *regs, int trapnr,
                               unsigned long fault_address)
{
        WARN_ONCE(trapnr == X86_TRAP_GP && !gp_fault_address_ok(fault_address),
                  "General protection fault in user access. Non-canonical address?");
        return ex_handler_default(fixup, regs);
}

static bool ex_handler_msr(const struct exception_table_entry *fixup,
                           struct pt_regs *regs, bool wrmsr, bool safe, int reg)
{
        if (__ONCE_LITE_IF(!safe && wrmsr)) {
                pr_warn("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
                        (unsigned int)regs->cx, (unsigned int)regs->dx,
                        (unsigned int)regs->ax, regs->ip, (void *)regs->ip);
                show_stack_regs(regs);
        }

        if (__ONCE_LITE_IF(!safe && !wrmsr)) {
                pr_warn("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
                        (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
                show_stack_regs(regs);
        }

        if (!wrmsr) {
                /* Pretend that the read succeeded and returned 0. */
                regs->ax = 0;
                regs->dx = 0;
        }

        if (safe)
                *pt_regs_nr(regs, reg) = -EIO;

        return ex_handler_default(fixup, regs);
}
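
/*
 * Loading %fs with a NULL selector does not clear the cached segment
 * base on every CPU (X86_BUG_NULL_SEG, seen on some AMD parts), so on
 * affected hardware a real selector is loaded first to force the base
 * to zero before the NULL load.
 */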
static bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
                                struct pt_regs *regs)
{
        if (static_cpu_has(X86_BUG_NULL_SEG))
                asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS));
        asm volatile ("mov %0, %%fs" : : "rm" (0));
        return ex_handler_default(fixup, regs);
}

static bool ex_handler_imm_reg(const struct exception_table_entry *fixup,
                               struct pt_regs *regs, int reg, int imm)
{
        *pt_regs_nr(regs, reg) = (long)imm;
        return ex_handler_default(fixup, regs);
}
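
/*
 * Fixup for user-copy routines that must report how many bytes were
 * left uncopied: the remaining length is reconstructed into %rcx from
 * a scale ('imm', e.g. the word size of the copy loop) and a second
 * register carrying the tail count, then the fault is handled as a
 * normal user access.
 */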
static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup,
                                 struct pt_regs *regs, int trapnr,
                                 unsigned long fault_address,
                                 int reg, int imm)
{
        regs->cx = imm * regs->cx + *pt_regs_nr(regs, reg);
        return ex_handler_uaccess(fixup, regs, trapnr, fault_address);
}

#ifdef CONFIG_X86_FRED
static bool ex_handler_eretu(const struct exception_table_entry *fixup,
                             struct pt_regs *regs, unsigned long error_code)
{
        struct pt_regs *uregs = (struct pt_regs *)(regs->sp - offsetof(struct pt_regs, orig_ax));
        unsigned short ss = uregs->ss;
        unsigned short cs = uregs->cs;

        /*
         * Move the NMI bit from the invalid stack frame, which caused ERETU
         * to fault, to the fault handler's stack frame, thus to unblock NMI
         * with the fault handler's ERETS instruction ASAP if NMI is blocked.
         */
        regs->fred_ss.nmi = uregs->fred_ss.nmi;

        /*
         * Sync event information to uregs, i.e., the ERETU return frame, but
         * is it safe to write to the ERETU return frame which is just above
         * current event stack frame?
         *
         * The RSP used by FRED to push a stack frame is not the value in %rsp,
         * it is calculated from %rsp with the following 2 steps:
         * 1) RSP = %rsp - (IA32_FRED_CONFIG & 0x1c0)   // Reserve N*64 bytes
         * 2) RSP = RSP & ~0x3f                         // Align to a 64-byte cache line
         * when an event delivery doesn't trigger a stack level change.
         *
         * Here is an example with N*64 (N=1) bytes reserved:
         *
         *  64-byte cache line ==>  ______________
         *                         |___Reserved___|
         *                         |__Event_data__|
         *                         |_____SS_______|
         *                         |_____RSP______|
         *                         |_____FLAGS____|
         *                         |_____CS_______|
         *                         |_____IP_______|
         *  64-byte cache line ==> |__Error_code__| <== ERETU return frame
         *                         |______________|
         *                         |______________|
         *                         |______________|
         *                         |______________|
         *                         |______________|
         *                         |______________|
         *                         |______________|
         *  64-byte cache line ==> |______________| <== RSP after step 1) and 2)
         *                         |___Reserved___|
         *                         |__Event_data__|
         *                         |_____SS_______|
         *                         |_____RSP______|
         *                         |_____FLAGS____|
         *                         |_____CS_______|
         *                         |_____IP_______|
         *  64-byte cache line ==> |__Error_code__| <== ERETS return frame
         *
         * Thus a new FRED stack frame will always be pushed below a previous
         * FRED stack frame ((N*64) bytes may be reserved between), and it is
         * safe to write to a previous FRED stack frame as they never overlap.
         */
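        /*
         * A quick sanity check of the math above (illustrative numbers):
         * with N=1, i.e. IA32_FRED_CONFIG & 0x1c0 == 0x40, and %rsp ==
         * 0x...08a8, step 1 gives 0x...0868 and step 2 rounds down to
         * 0x...0840, so the nested frame lands a full cache line below
         * the frame that contains uregs and the writes below are safe.
         */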
        fred_info(uregs)->edata = fred_event_data(regs);
        uregs->ssx = regs->ssx;
        uregs->fred_ss.ss = ss;
        /* The NMI bit was moved away above */
        uregs->fred_ss.nmi = 0;
        uregs->csx = regs->csx;
        uregs->fred_cs.sl = 0;
        uregs->fred_cs.wfe = 0;
        uregs->cs = cs;
        uregs->orig_ax = error_code;

        return ex_handler_default(fixup, regs);
}
#endif

int ex_get_fixup_type(unsigned long ip)
{
        const struct exception_table_entry *e = search_exception_tables(ip);

        return e ? FIELD_GET(EX_DATA_TYPE_MASK, e->data) : EX_TYPE_NONE;
}
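
/*
 * Each exception table entry packs a handler type plus an optional
 * register number and immediate into its 32-bit data word (the
 * EX_DATA_TYPE/REG/IMM_MASK fields decoded below). The entries
 * themselves are emitted next to the faulting instruction, e.g. with
 * the _ASM_EXTABLE_TYPE() family of macros in asm/asm.h.
 */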
int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
                    unsigned long fault_addr)
{
        const struct exception_table_entry *e;
        int type, reg, imm;

#ifdef CONFIG_PNPBIOS
        if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
                extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
                extern u32 pnp_bios_is_utter_crap;

                pnp_bios_is_utter_crap = 1;
                printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
                __asm__ volatile(
                        "movl %0, %%esp\n\t"
                        "jmp *%1\n\t"
                        : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
                panic("do_trap: can't hit this");
        }
#endif

        e = search_exception_tables(regs->ip);
        if (!e)
                return 0;

        type = FIELD_GET(EX_DATA_TYPE_MASK, e->data);
        reg  = FIELD_GET(EX_DATA_REG_MASK,  e->data);
        imm  = FIELD_GET(EX_DATA_IMM_MASK,  e->data);

        switch (type) {
        case EX_TYPE_DEFAULT:
        case EX_TYPE_DEFAULT_MCE_SAFE:
                return ex_handler_default(e, regs);
        case EX_TYPE_FAULT:
        case EX_TYPE_FAULT_MCE_SAFE:
                return ex_handler_fault(e, regs, trapnr);
        case EX_TYPE_UACCESS:
                return ex_handler_uaccess(e, regs, trapnr, fault_addr);
        case EX_TYPE_CLEAR_FS:
                return ex_handler_clear_fs(e, regs);
        case EX_TYPE_FPU_RESTORE:
                return ex_handler_fprestore(e, regs);
        case EX_TYPE_BPF:
                return ex_handler_bpf(e, regs);
        case EX_TYPE_WRMSR:
                return ex_handler_msr(e, regs, true, false, reg);
        case EX_TYPE_RDMSR:
                return ex_handler_msr(e, regs, false, false, reg);
        case EX_TYPE_WRMSR_SAFE:
                return ex_handler_msr(e, regs, true, true, reg);
        case EX_TYPE_RDMSR_SAFE:
                return ex_handler_msr(e, regs, false, true, reg);
        case EX_TYPE_WRMSR_IN_MCE:
                ex_handler_msr_mce(regs, true);
                break;
        case EX_TYPE_RDMSR_IN_MCE:
                ex_handler_msr_mce(regs, false);
                break;
        case EX_TYPE_POP_REG:
                regs->sp += sizeof(long);
                fallthrough;
        case EX_TYPE_IMM_REG:
                return ex_handler_imm_reg(e, regs, reg, imm);
        case EX_TYPE_FAULT_SGX:
                return ex_handler_sgx(e, regs, trapnr);
        case EX_TYPE_UCOPY_LEN:
                return ex_handler_ucopy_len(e, regs, trapnr, fault_addr, reg, imm);
        case EX_TYPE_ZEROPAD:
                return ex_handler_zeropad(e, regs, fault_addr);
#ifdef CONFIG_X86_FRED
        case EX_TYPE_ERETU:
                return ex_handler_eretu(e, regs, error_code);
#endif
        }
        BUG();
}

extern unsigned int early_recursion_flag;

/* Restricted version used during very early boot */
void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
{
        /* Ignore early NMIs. */
        if (trapnr == X86_TRAP_NMI)
                return;

        if (early_recursion_flag > 2)
                goto halt_loop;

        /*
         * Old CPUs leave the high bits of CS on the stack
         * undefined.  I'm not sure which CPUs do this, but at least
         * the 486 DX works this way.
         * Xen pv domains are not using the default __KERNEL_CS.
         */
        if (!xen_pv_domain() && regs->cs != __KERNEL_CS)
                goto fail;

        /*
         * The full exception fixup machinery is available as soon as
         * the early IDT is loaded.  This means that it is the
         * responsibility of extable users to either function correctly
         * when handlers are invoked early or to simply avoid causing
         * exceptions before they're ready to handle them.
         *
         * This is better than filtering which handlers can be used,
         * because refusing to call a handler here is guaranteed to
         * result in a hard-to-debug panic.
         *
         * Keep in mind that not all vectors actually get here.  Early
         * page faults, for example, are special.
         */
        if (fixup_exception(regs, trapnr, regs->orig_ax, 0))
                return;

        if (trapnr == X86_TRAP_UD) {
                if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
                        /* Skip the ud2. */
                        regs->ip += LEN_UD2;
                        return;
                }

                /*
                 * If this was a BUG and report_bug returns or if this
                 * was just a normal #UD, we want to continue onward and
                 * crash.
                 */
        }

fail:
        early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n",
                     (unsigned)trapnr, (unsigned long)regs->cs, regs->ip,
                     regs->orig_ax, read_cr2());

        show_regs(regs);

halt_loop:
        while (true)
                halt();
}