// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Kernel Probes (KProbes)
 *
 * Copyright (C) IBM Corporation, 2002, 2004
 *
 * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
 *		Probes initial implementation ( includes contributions from
 *		Rusty Russell).
 * 2004-July	Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
 *		interface to access function arguments.
 * 2004-Oct	Jim Keniston <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
 *		<prasanna@in.ibm.com> adapted for x86_64 from i386.
 * 2005-Mar	Roland McGrath <roland@redhat.com>
 *		Fixed to handle %rip-relative addressing mode correctly.
 * 2005-May	Hien Nguyen <hien@us.ibm.com>, Jim Keniston
 *		<jkenisto@us.ibm.com> and Prasanna S Panchamukhi
 *		<prasanna@in.ibm.com> added function-return probes.
 * 2005-May	Rusty Lynch <rusty.lynch@intel.com>
 *		Added function return probes functionality
 * 2006-Feb	Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
 *		kprobe-booster and kretprobe-booster for i386.
 * 2007-Dec	Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
 *		and kretprobe-booster for x86-64
 * 2007-Dec	Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
 *		<arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
 *		unified x86 kprobes code.
 */
#include <linux/kprobes.h>
#include <linux/ptrace.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/hardirq.h>
#include <linux/preempt.h>
#include <linux/sched/debug.h>
#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
#include <linux/kgdb.h>
#include <linux/ftrace.h>
#include <linux/kasan.h>
#include <linux/objtool.h>
#include <linux/vmalloc.h>
#include <linux/pgtable.h>
#include <linux/set_memory.h>
#include <linux/cfi.h>
#include <linux/execmem.h>

#include <asm/text-patching.h>
#include <asm/cacheflush.h>
#include <asm/desc.h>
#include <linux/uaccess.h>
#include <asm/alternative.h>
#include <asm/insn.h>
#include <asm/debugreg.h>
#include <asm/ibt.h>

#include "common.h"

DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);

#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
	(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
	  (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
	  (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
	  (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
	 << (row % 32))
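
/*
 * W() packs sixteen one-bit flags for opcodes row+0x0 .. row+0xf into bit
 * positions 0..15 and shifts the group by (row % 32), so each pair of
 * adjacent rows OR'ed together in the table below (e.g. 0x00 | 0x10) fills
 * one 32-bit entry.
 */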

/*
 * Undefined/reserved opcodes, conditional jumps, Opcode Extension
 * Groups, and some special opcodes cannot be boosted.
 * This is non-const and volatile to keep gcc from statically
 * optimizing it out, as variable_test_bit makes gcc think only
 * *(unsigned long*) is used.
 */
static volatile u32 twobyte_is_boostable[256 / 32] = {
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
	/*      ----------------------------------------------          */
	W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
	W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1) , /* 10 */
	W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */
	W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
	W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
	W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
	W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */
	W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
	W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */
	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
	W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */
	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */
	W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
	W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */
	W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */
	W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0)   /* f0 */
	/*      -----------------------------------------------         */
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
};
#undef W

struct kretprobe_blackpoint kretprobe_blacklist[] = {
	{"__switch_to", },	/* This function switches only the current task,
				   but doesn't switch the kernel stack. */
	{NULL, NULL}		/* Terminator */
};

const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);

static nokprobe_inline void
__synthesize_relative_insn(void *dest, void *from, void *to, u8 op)
{
	struct __arch_relative_insn {
		u8 op;
		s32 raddr;
	} __packed *insn;

	insn = (struct __arch_relative_insn *)dest;
	insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
	insn->op = op;
}

/* Insert a jump instruction at address 'from', which jumps to address 'to'. */
void synthesize_reljump(void *dest, void *from, void *to)
{
	__synthesize_relative_insn(dest, from, to, JMP32_INSN_OPCODE);
}
NOKPROBE_SYMBOL(synthesize_reljump);

/* Insert a call instruction at address 'from', which calls address 'to'. */
void synthesize_relcall(void *dest, void *from, void *to)
{
	__synthesize_relative_insn(dest, from, to, CALL_INSN_OPCODE);
}
NOKPROBE_SYMBOL(synthesize_relcall);
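
/*
 * Both helpers emit a 5-byte relative transfer: a single opcode byte
 * (0xE9 for JMP32_INSN_OPCODE, 0xE8 for CALL_INSN_OPCODE) followed by a
 * signed 32-bit displacement computed as 'to' - ('from' + 5), i.e.
 * relative to the end of the instruction as it will execute at 'from',
 * even though the bytes themselves are written into the buffer at 'dest'.
 */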

/*
 * Returns true if INSN is boostable.
 * RIP-relative instructions are adjusted at copying time in 64-bit mode.
 */
bool can_boost(struct insn *insn, void *addr)
{
	kprobe_opcode_t opcode;
	insn_byte_t prefix;
	int i;

	if (search_exception_tables((unsigned long)addr))
		return false;	/* Page fault may occur on this address. */

	/* 2nd-byte opcode */
	if (insn->opcode.nbytes == 2)
		return test_bit(insn->opcode.bytes[1],
				(unsigned long *)twobyte_is_boostable);

	if (insn->opcode.nbytes != 1)
		return false;

	for_each_insn_prefix(insn, i, prefix) {
		insn_attr_t attr;

		attr = inat_get_opcode_attribute(prefix);
		/* Can't boost Address-size override prefix and CS override prefix */
		if (prefix == 0x2e || inat_is_address_size_prefix(attr))
			return false;
	}

	opcode = insn->opcode.bytes[0];

	switch (opcode) {
	case 0x62:		/* bound */
	case 0x70 ... 0x7f:	/* Conditional jumps */
	case 0x9a:		/* Call far */
	case 0xcc ... 0xce:	/* software exceptions */
	case 0xd6:		/* (UD) */
	case 0xd8 ... 0xdf:	/* ESC */
	case 0xe0 ... 0xe3:	/* LOOP*, JCXZ */
	case 0xe8 ... 0xe9:	/* near Call, JMP */
	case 0xeb:		/* Short JMP */
	case 0xf0 ... 0xf4:	/* LOCK/REP, HLT */
		/* ... are not boostable */
		return false;
	case 0xc0 ... 0xc1:	/* Grp2 */
	case 0xd0 ... 0xd3:	/* Grp2 */
		/*
		 * AMD uses nnn == 110 as SHL/SAL, but Intel makes it reserved.
		 */
		return X86_MODRM_REG(insn->modrm.bytes[0]) != 0b110;
	case 0xf6 ... 0xf7:	/* Grp3 */
		/* AMD uses nnn == 001 as TEST, but Intel makes it reserved. */
		return X86_MODRM_REG(insn->modrm.bytes[0]) != 0b001;
	case 0xfe:		/* Grp4 */
		/* Only INC and DEC are boostable */
		return X86_MODRM_REG(insn->modrm.bytes[0]) == 0b000 ||
		       X86_MODRM_REG(insn->modrm.bytes[0]) == 0b001;
	case 0xff:		/* Grp5 */
		/* Only INC, DEC, and indirect JMP are boostable */
		return X86_MODRM_REG(insn->modrm.bytes[0]) == 0b000 ||
		       X86_MODRM_REG(insn->modrm.bytes[0]) == 0b001 ||
		       X86_MODRM_REG(insn->modrm.bytes[0]) == 0b100;
	default:
		return true;
	}
}

static unsigned long
__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
{
	struct kprobe *kp;
	bool faddr;

	kp = get_kprobe((void *)addr);
	faddr = ftrace_location(addr) == addr;
	/*
	 * Use the current code if it is not modified by Kprobe
	 * and it cannot be modified by ftrace.
	 */
	if (!kp && !faddr)
		return addr;

	/*
	 * Basically, kp->ainsn.insn has the original instruction.
	 * However, a RIP-relative instruction cannot be single-stepped at a
	 * different place, so __copy_instruction() tweaks the displacement of
	 * such an instruction. In that case, we can't recover the instruction
	 * from kp->ainsn.insn.
	 *
	 * On the other hand, in the case of a normal Kprobe, kp->opcode has a
	 * copy of the first byte of the probed instruction, which is
	 * overwritten by int3. Since the instruction at kp->addr is not
	 * modified by kprobes except for the first byte, we can recover the
	 * original instruction from it and kp->opcode.
	 *
	 * In case of Kprobes using ftrace, we do not have a copy of
	 * the original instruction. In fact, the ftrace location might
	 * be modified at any time and could even be in an inconsistent state.
	 * Fortunately, we know that the original code is the ideal 5-byte
	 * long NOP.
	 */
	if (copy_from_kernel_nofault(buf, (void *)addr,
		MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
		return 0UL;

	if (faddr)
		memcpy(buf, x86_nops[5], 5);
	else
		buf[0] = kp->opcode;
	return (unsigned long)buf;
}

/*
 * Recover the probed instruction at addr for further analysis.
 * The caller must lock kprobes via kprobe_mutex, or disable preemption,
 * to prevent the kprobes it references from being released.
 * Returns zero if the instruction cannot be recovered (or access failed).
 */
unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
{
	unsigned long __addr;

	__addr = __recover_optprobed_insn(buf, addr);
	if (__addr != addr)
		return __addr;

	return __recover_probed_insn(buf, addr);
}

/* Check if insn is INT or UD */
static inline bool is_exception_insn(struct insn *insn)
{
	/* UD uses 0f escape */
	if (insn->opcode.bytes[0] == 0x0f) {
		/* UD0 / UD1 / UD2 */
		return insn->opcode.bytes[1] == 0xff ||
		       insn->opcode.bytes[1] == 0xb9 ||
		       insn->opcode.bytes[1] == 0x0b;
	}

	/* INT3 / INT n / INTO / INT1 */
	return insn->opcode.bytes[0] == 0xcc ||
	       insn->opcode.bytes[0] == 0xcd ||
	       insn->opcode.bytes[0] == 0xce ||
	       insn->opcode.bytes[0] == 0xf1;
}

/*
 * Check if paddr is at an instruction boundary and that instruction can
 * be probed
 */
static bool can_probe(unsigned long paddr)
{
	unsigned long addr, __addr, offset = 0;
	struct insn insn;
	kprobe_opcode_t buf[MAX_INSN_SIZE];

	if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
		return false;

	/* Decode instructions */
	addr = paddr - offset;
	while (addr < paddr) {
		/*
		 * Check if the instruction has been modified by another
		 * kprobe, in which case we replace the breakpoint by the
		 * original instruction in our buffer.
		 * Also, jump optimization will change the breakpoint to
		 * relative-jump. Since the relative-jump itself is
		 * normally used, we just go through if there is no kprobe.
		 */
		__addr = recover_probed_instruction(buf, addr);
		if (!__addr)
			return false;

		if (insn_decode_kernel(&insn, (void *)__addr) < 0)
			return false;

#ifdef CONFIG_KGDB
		/*
		 * If there is a dynamically installed kgdb sw breakpoint,
		 * this function should not be probed.
		 */
		if (insn.opcode.bytes[0] == INT3_INSN_OPCODE &&
		    kgdb_has_hit_break(addr))
			return false;
#endif
		addr += insn.length;
	}

	/* Check if paddr is at an instruction boundary */
	if (addr != paddr)
		return false;

	__addr = recover_probed_instruction(buf, addr);
	if (!__addr)
		return false;

	if (insn_decode_kernel(&insn, (void *)__addr) < 0)
		return false;

	/* INT and UD are special and should not be kprobed */
	if (is_exception_insn(&insn))
		return false;

	if (IS_ENABLED(CONFIG_CFI_CLANG)) {
		/*
		 * The compiler generates the following instruction sequence
		 * for indirect call checks and cfi.c decodes this;
		 *
		 *   movl    -<id>, %r10d       ; 6 bytes
		 *   addl    -4(%reg), %r10d    ; 4 bytes
		 *   je      .Ltmp1             ; 2 bytes
		 *   ud2                        ; <- regs->ip
		 * .Ltmp1:
		 *
		 * Also, these movl and addl are used for showing expected
		 * type. So those must not be touched.
		 */
		if (insn.opcode.value == 0xBA)
			offset = 12;
		else if (insn.opcode.value == 0x3)
			offset = 6;
		else
			goto out;

		/* This movl/addl is used for decoding CFI. */
		if (is_cfi_trap(addr + offset))
			return false;
	}

out:
	return true;
}

/* If x86 supports IBT (ENDBR) it must be skipped. */
kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset,
					 bool *on_func_entry)
{
	u32 insn;

	/*
	 * Since 'addr' is not guaranteed to be safe to access, use
	 * copy_from_kernel_nofault() to read the instruction:
	 */
	if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(u32)))
		return NULL;

	if (is_endbr(insn)) {
		*on_func_entry = !offset || offset == 4;
		if (*on_func_entry)
			offset = 4;
	} else {
		*on_func_entry = !offset;
	}

	return (kprobe_opcode_t *)(addr + offset);
}

/*
 * Copy an instruction, recovering it if it has been modified by kprobes,
 * and adjust the displacement if the instruction uses the %rip-relative
 * addressing mode. Note that since @real will be the final place of the
 * copied instruction, the displacement must be adjusted against @real,
 * not @dest.
 * This returns the length of the copied instruction, or 0 on error.
 */
int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
{
	kprobe_opcode_t buf[MAX_INSN_SIZE];
	unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src);
	int ret;

	if (!recovered_insn || !insn)
		return 0;

	/* This can access kernel text if given address is not recovered */
	if (copy_from_kernel_nofault(dest, (void *)recovered_insn, MAX_INSN_SIZE))
		return 0;

	ret = insn_decode_kernel(insn, dest);
	if (ret < 0)
		return 0;

	/* We cannot probe an instruction that is prefixed to force emulation */
	if (insn_has_emulate_prefix(insn))
		return 0;

	/* Another subsystem puts a breakpoint, failed to recover */
	if (insn->opcode.bytes[0] == INT3_INSN_OPCODE)
		return 0;

	/* We should not singlestep on the exception masking instructions */
	if (insn_masking_exception(insn))
		return 0;

#ifdef CONFIG_X86_64
	/* Only x86_64 has RIP relative instructions */
	if (insn_rip_relative(insn)) {
		s64 newdisp;
		u8 *disp;
		/*
		 * The copied instruction uses the %rip-relative addressing
		 * mode. Adjust the displacement for the difference between
		 * the original location of this instruction and the location
		 * of the copy that will actually be run. The tricky bit here
		 * is making sure that the sign extension happens correctly in
		 * this calculation, since we need a signed 32-bit result to
		 * be sign-extended to 64 bits when it's added to the %rip
		 * value and yield the same 64-bit result that the sign-
		 * extension of the original signed 32-bit displacement would
		 * have given.
		 */
		newdisp = (u8 *) src + (s64) insn->displacement.value
			  - (u8 *) real;
		if ((s64) (s32) newdisp != newdisp) {
			pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
			return 0;
		}
		disp = (u8 *) dest + insn_offset_displacement(insn);
		*(s32 *) disp = (s32) newdisp;
	}
#endif
	return insn->length;
}
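
/*
 * For example, if an instruction with a %rip-relative operand and
 * displacement 'disp' is copied from 'src' to 'real', both copies have the
 * same length 'len', so the original target src + len + disp equals
 * real + len + newdisp exactly when newdisp = src + disp - real, which is
 * the adjustment __copy_instruction() applies above.
 */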

/* Prepare reljump or int3 right after instruction */
static int prepare_singlestep(kprobe_opcode_t *buf, struct kprobe *p,
			      struct insn *insn)
{
	int len = insn->length;

	if (!IS_ENABLED(CONFIG_PREEMPTION) &&
	    !p->post_handler && can_boost(insn, p->addr) &&
	    MAX_INSN_SIZE - len >= JMP32_INSN_SIZE) {
		/*
		 * Such an instruction can be executed directly, provided it
		 * jumps back to the correct address afterwards.
		 */
		synthesize_reljump(buf + len, p->ainsn.insn + len,
				   p->addr + insn->length);
		len += JMP32_INSN_SIZE;
		p->ainsn.boostable = 1;
	} else {
		/* Otherwise, put an int3 for trapping singlestep */
		if (MAX_INSN_SIZE - len < INT3_INSN_SIZE)
			return -ENOSPC;
		buf[len] = INT3_INSN_OPCODE;
		len += INT3_INSN_SIZE;
	}

	return len;
}

/* Make the page read-only and executable when allocating it */
void *alloc_insn_page(void)
{
	void *page;

	page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
	if (!page)
		return NULL;

	/*
	 * TODO: Once additional kernel code protection mechanisms are set, ensure
	 * that the page was not maliciously altered and it is still zeroed.
	 */
	set_memory_rox((unsigned long)page, 1);

	return page;
}

/* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */

static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs)
{
	switch (p->ainsn.opcode) {
	case 0xfa:	/* cli */
		regs->flags &= ~(X86_EFLAGS_IF);
		break;
	case 0xfb:	/* sti */
		regs->flags |= X86_EFLAGS_IF;
		break;
	case 0x9c:	/* pushf */
		int3_emulate_push(regs, regs->flags);
		break;
	case 0x9d:	/* popf */
		regs->flags = int3_emulate_pop(regs);
		break;
	}
	regs->ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
}
NOKPROBE_SYMBOL(kprobe_emulate_ifmodifiers);
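
/*
 * In the emulation handlers, regs->ip - INT3_INSN_SIZE + p->ainsn.size is
 * the address just past the probed instruction: on entry regs->ip points
 * past the 1-byte int3 at p->addr, so stepping back by INT3_INSN_SIZE and
 * forward by the original instruction size yields the fall-through address
 * (used as the return address for calls and the base for relative jumps).
 */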

static void kprobe_emulate_ret(struct kprobe *p, struct pt_regs *regs)
{
	int3_emulate_ret(regs);
}
NOKPROBE_SYMBOL(kprobe_emulate_ret);

static void kprobe_emulate_call(struct kprobe *p, struct pt_regs *regs)
{
	unsigned long func = regs->ip - INT3_INSN_SIZE + p->ainsn.size;

	func += p->ainsn.rel32;
	int3_emulate_call(regs, func);
}
NOKPROBE_SYMBOL(kprobe_emulate_call);

static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs)
{
	unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;

	ip += p->ainsn.rel32;
	int3_emulate_jmp(regs, ip);
}
NOKPROBE_SYMBOL(kprobe_emulate_jmp);

static void kprobe_emulate_jcc(struct kprobe *p, struct pt_regs *regs)
{
	unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;

	int3_emulate_jcc(regs, p->ainsn.jcc.type, ip, p->ainsn.rel32);
}
NOKPROBE_SYMBOL(kprobe_emulate_jcc);

static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs)
{
	unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
	bool match;

	if (p->ainsn.loop.type != 3) {	/* LOOP* */
		if (p->ainsn.loop.asize == 32)
			match = ((*(u32 *)&regs->cx)--) != 0;
#ifdef CONFIG_X86_64
		else if (p->ainsn.loop.asize == 64)
			match = ((*(u64 *)&regs->cx)--) != 0;
#endif
		else
			match = ((*(u16 *)&regs->cx)--) != 0;
	} else {			/* JCXZ */
		if (p->ainsn.loop.asize == 32)
			match = *(u32 *)(&regs->cx) == 0;
#ifdef CONFIG_X86_64
		else if (p->ainsn.loop.asize == 64)
			match = *(u64 *)(&regs->cx) == 0;
#endif
		else
			match = *(u16 *)(&regs->cx) == 0;
	}

	if (p->ainsn.loop.type == 0)		/* LOOPNE */
		match = match && !(regs->flags & X86_EFLAGS_ZF);
	else if (p->ainsn.loop.type == 1)	/* LOOPE */
		match = match && (regs->flags & X86_EFLAGS_ZF);

	if (match)
		ip += p->ainsn.rel32;
	int3_emulate_jmp(regs, ip);
}
NOKPROBE_SYMBOL(kprobe_emulate_loop);

static const int addrmode_regoffs[] = {
	offsetof(struct pt_regs, ax),
	offsetof(struct pt_regs, cx),
	offsetof(struct pt_regs, dx),
	offsetof(struct pt_regs, bx),
	offsetof(struct pt_regs, sp),
	offsetof(struct pt_regs, bp),
	offsetof(struct pt_regs, si),
	offsetof(struct pt_regs, di),
#ifdef CONFIG_X86_64
	offsetof(struct pt_regs, r8),
	offsetof(struct pt_regs, r9),
	offsetof(struct pt_regs, r10),
	offsetof(struct pt_regs, r11),
	offsetof(struct pt_regs, r12),
	offsetof(struct pt_regs, r13),
	offsetof(struct pt_regs, r14),
	offsetof(struct pt_regs, r15),
#endif
};
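
/*
 * addrmode_regoffs[] is indexed by the hardware register number taken from
 * the ModRM r/m field (extended by REX.B on x86-64 in prepare_emulation()),
 * so entry 0 is AX/RAX, 1 is CX/RCX, ..., and entries 8-15 map to R8-R15.
 */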

static void kprobe_emulate_call_indirect(struct kprobe *p, struct pt_regs *regs)
{
	unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg];

	int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + p->ainsn.size);
	int3_emulate_jmp(regs, regs_get_register(regs, offs));
}
NOKPROBE_SYMBOL(kprobe_emulate_call_indirect);

static void kprobe_emulate_jmp_indirect(struct kprobe *p, struct pt_regs *regs)
{
	unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg];

	int3_emulate_jmp(regs, regs_get_register(regs, offs));
}
NOKPROBE_SYMBOL(kprobe_emulate_jmp_indirect);

static int prepare_emulation(struct kprobe *p, struct insn *insn)
{
	insn_byte_t opcode = insn->opcode.bytes[0];

	switch (opcode) {
	case 0xfa:		/* cli */
	case 0xfb:		/* sti */
	case 0x9c:		/* pushfl */
	case 0x9d:		/* popf/popfd */
		/*
		 * IF modifiers must be emulated since they would enable
		 * interrupts while single stepping with int3.
		 */
		p->ainsn.emulate_op = kprobe_emulate_ifmodifiers;
		p->ainsn.opcode = opcode;
		break;
	case 0xc2:		/* ret/lret */
	case 0xc3:
	case 0xca:
	case 0xcb:
		p->ainsn.emulate_op = kprobe_emulate_ret;
		break;
	case 0x9a:		/* far call absolute -- segment is not supported */
	case 0xea:		/* far jmp absolute -- segment is not supported */
	case 0xcc:		/* int3 */
	case 0xcf:		/* iret -- in-kernel IRET is not supported */
		return -EOPNOTSUPP;
	case 0xe8:		/* near call relative */
		p->ainsn.emulate_op = kprobe_emulate_call;
		if (insn->immediate.nbytes == 2)
			p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
		else
			p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
		break;
	case 0xeb:		/* short jump relative */
	case 0xe9:		/* near jump relative */
		p->ainsn.emulate_op = kprobe_emulate_jmp;
		if (insn->immediate.nbytes == 1)
			p->ainsn.rel32 = *(s8 *)&insn->immediate.value;
		else if (insn->immediate.nbytes == 2)
			p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
		else
			p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
		break;
	case 0x70 ... 0x7f:
		/* 1 byte conditional jump */
		p->ainsn.emulate_op = kprobe_emulate_jcc;
		p->ainsn.jcc.type = opcode & 0xf;
		p->ainsn.rel32 = insn->immediate.value;
		break;
	case 0x0f:
		opcode = insn->opcode.bytes[1];
		if ((opcode & 0xf0) == 0x80) {
			/* 2 bytes conditional jump */
			p->ainsn.emulate_op = kprobe_emulate_jcc;
			p->ainsn.jcc.type = opcode & 0xf;
			if (insn->immediate.nbytes == 2)
				p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
			else
				p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
		} else if (opcode == 0x01 &&
			   X86_MODRM_REG(insn->modrm.bytes[0]) == 0 &&
			   X86_MODRM_MOD(insn->modrm.bytes[0]) == 3) {
			/* VM extensions - not supported */
			return -EOPNOTSUPP;
		}
		break;
	case 0xe0:	/* Loop NZ */
	case 0xe1:	/* Loop */
	case 0xe2:	/* Loop */
	case 0xe3:	/* J*CXZ */
		p->ainsn.emulate_op = kprobe_emulate_loop;
		p->ainsn.loop.type = opcode & 0x3;
		p->ainsn.loop.asize = insn->addr_bytes * 8;
		p->ainsn.rel32 = *(s8 *)&insn->immediate.value;
		break;
	case 0xff:
		/*
		 * Since 0xff is an extended group opcode, the instruction
		 * is determined by the MOD/RM byte.
		 */
		opcode = insn->modrm.bytes[0];
		switch (X86_MODRM_REG(opcode)) {
		case 0b010:	/* FF /2, call near, absolute indirect */
			p->ainsn.emulate_op = kprobe_emulate_call_indirect;
			break;
		case 0b100:	/* FF /4, jmp near, absolute indirect */
			p->ainsn.emulate_op = kprobe_emulate_jmp_indirect;
			break;
		case 0b011:	/* FF /3, call far, absolute indirect */
		case 0b101:	/* FF /5, jmp far, absolute indirect */
			return -EOPNOTSUPP;
		}

		if (!p->ainsn.emulate_op)
			break;

		if (insn->addr_bytes != sizeof(unsigned long))
			return -EOPNOTSUPP;	/* Don't support different size */
		if (X86_MODRM_MOD(opcode) != 3)
			return -EOPNOTSUPP;	/* TODO: support memory addressing */

		p->ainsn.indirect.reg = X86_MODRM_RM(opcode);
#ifdef CONFIG_X86_64
		if (X86_REX_B(insn->rex_prefix.value))
			p->ainsn.indirect.reg += 8;
#endif
		break;
	default:
		break;
	}
	p->ainsn.size = insn->length;

	return 0;
}

static int arch_copy_kprobe(struct kprobe *p)
{
	struct insn insn;
	kprobe_opcode_t buf[MAX_INSN_SIZE];
	int ret, len;

	/* Copy the instruction, recovering it if another optprobe has modified it. */
	len = __copy_instruction(buf, p->addr, p->ainsn.insn, &insn);
	if (!len)
		return -EINVAL;

	/* Analyze the opcode and setup emulate functions */
	ret = prepare_emulation(p, &insn);
	if (ret < 0)
		return ret;

	/* Add int3 for single-step or booster jmp */
	len = prepare_singlestep(buf, p, &insn);
	if (len < 0)
		return len;

	/* Also, displacement change doesn't affect the first byte */
	p->opcode = buf[0];

	p->ainsn.tp_len = len;
	perf_event_text_poke(p->ainsn.insn, NULL, 0, buf, len);

	/* OK, write back the instruction(s) into ROX insn buffer */
	text_poke(p->ainsn.insn, buf, len);

	return 0;
}

int arch_prepare_kprobe(struct kprobe *p)
{
	int ret;

	if (alternatives_text_reserved(p->addr, p->addr))
		return -EINVAL;

	if (!can_probe((unsigned long)p->addr))
		return -EILSEQ;

	memset(&p->ainsn, 0, sizeof(p->ainsn));

	/* insn: must be on special executable page on x86. */
	p->ainsn.insn = get_insn_slot();
	if (!p->ainsn.insn)
		return -ENOMEM;

	ret = arch_copy_kprobe(p);
	if (ret) {
		free_insn_slot(p->ainsn.insn, 0);
		p->ainsn.insn = NULL;
	}

	return ret;
}

void arch_arm_kprobe(struct kprobe *p)
{
	u8 int3 = INT3_INSN_OPCODE;

	text_poke(p->addr, &int3, 1);
	text_poke_sync();
	perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1);
}

void arch_disarm_kprobe(struct kprobe *p)
{
	u8 int3 = INT3_INSN_OPCODE;

	perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1);
	text_poke(p->addr, &p->opcode, 1);
	text_poke_sync();
}

void arch_remove_kprobe(struct kprobe *p)
{
	if (p->ainsn.insn) {
		/* Record the perf event before freeing the slot */
		perf_event_text_poke(p->ainsn.insn, p->ainsn.insn,
				     p->ainsn.tp_len, NULL, 0);
		free_insn_slot(p->ainsn.insn, p->ainsn.boostable);
		p->ainsn.insn = NULL;
	}
}

static nokprobe_inline void
save_previous_kprobe(struct kprobe_ctlblk *kcb)
{
	kcb->prev_kprobe.kp = kprobe_running();
	kcb->prev_kprobe.status = kcb->kprobe_status;
	kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags;
	kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
}

static nokprobe_inline void
restore_previous_kprobe(struct kprobe_ctlblk *kcb)
{
	__this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
	kcb->kprobe_status = kcb->prev_kprobe.status;
	kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
	kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
}

static nokprobe_inline void
set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
		   struct kprobe_ctlblk *kcb)
{
	__this_cpu_write(current_kprobe, p);
	kcb->kprobe_saved_flags = kcb->kprobe_old_flags
		= (regs->flags & X86_EFLAGS_IF);
}

static void kprobe_post_process(struct kprobe *cur, struct pt_regs *regs,
				struct kprobe_ctlblk *kcb)
{
	/* Restore back the original saved kprobes variables and continue. */
	if (kcb->kprobe_status == KPROBE_REENTER) {
		/* This will restore both kcb and current_kprobe */
		restore_previous_kprobe(kcb);
	} else {
		/*
		 * Always update the kcb status because
		 * reset_current_kprobe() doesn't update kcb.
		 */
		kcb->kprobe_status = KPROBE_HIT_SSDONE;
		if (cur->post_handler)
			cur->post_handler(cur, regs, 0);
		reset_current_kprobe();
	}
}
NOKPROBE_SYMBOL(kprobe_post_process);

static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
			     struct kprobe_ctlblk *kcb, int reenter)
{
	if (setup_detour_execution(p, regs, reenter))
		return;

#if !defined(CONFIG_PREEMPTION)
	if (p->ainsn.boostable) {
		/* Boost up -- we can execute copied instructions directly */
		if (!reenter)
			reset_current_kprobe();
		/*
		 * Reentering a boosted probe doesn't reset current_kprobe,
		 * nor set current_kprobe, because it doesn't use single
		 * stepping.
		 */
		regs->ip = (unsigned long)p->ainsn.insn;
		return;
	}
#endif
	if (reenter) {
		save_previous_kprobe(kcb);
		set_current_kprobe(p, regs, kcb);
		kcb->kprobe_status = KPROBE_REENTER;
	} else
		kcb->kprobe_status = KPROBE_HIT_SS;

	if (p->ainsn.emulate_op) {
		p->ainsn.emulate_op(p, regs);
		kprobe_post_process(p, regs, kcb);
		return;
	}

	/* Disable interrupts and set the ip register to the trampoline */
	regs->flags &= ~X86_EFLAGS_IF;
	regs->ip = (unsigned long)p->ainsn.insn;
}
NOKPROBE_SYMBOL(setup_singlestep);

/*
 * Called after single-stepping. p->addr is the address of the
 * instruction whose first byte has been replaced by the "int3"
 * instruction. To avoid the SMP problems that can occur when we
 * temporarily put back the original opcode to single-step, we
 * single-stepped a copy of the instruction. The address of this
 * copy is p->ainsn.insn. We also don't use a trap, but another "int3"
 * placed right after the copied instruction.
 * Unlike trap-based single-stepping, "int3" single-stepping cannot
 * handle instructions that change the ip register, e.g. jmp,
 * call, conditional jmp, nor instructions that change the IF
 * flag, because interrupts must be disabled around the single-stepping.
 * Such instructions are software emulated, but others are single-stepped
 * using "int3".
 *
 * When the 2nd "int3" is handled, regs->ip and regs->flags need to
 * be adjusted, so that we can resume execution on the correct code.
 */
static void resume_singlestep(struct kprobe *p, struct pt_regs *regs,
			      struct kprobe_ctlblk *kcb)
{
	unsigned long copy_ip = (unsigned long)p->ainsn.insn;
	unsigned long orig_ip = (unsigned long)p->addr;

	/* Restore saved interrupt flag and ip register */
	regs->flags |= kcb->kprobe_saved_flags;
	/* Note that regs->ip points past the executed int3, so step back */
	regs->ip += (orig_ip - copy_ip) - INT3_INSN_SIZE;
}
NOKPROBE_SYMBOL(resume_singlestep);

/*
 * We have reentered the kprobe_handler(), since another probe was hit while
 * within the handler. We save the original kprobes variables and just single
 * step on the instruction of the new probe without calling any user handlers.
 */
static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
			  struct kprobe_ctlblk *kcb)
{
	switch (kcb->kprobe_status) {
	case KPROBE_HIT_SSDONE:
	case KPROBE_HIT_ACTIVE:
	case KPROBE_HIT_SS:
		kprobes_inc_nmissed_count(p);
		setup_singlestep(p, regs, kcb, 1);
		break;
	case KPROBE_REENTER:
		/*
		 * A probe has been hit in the codepath leading up to, or just
		 * after, single-stepping of a probed instruction. This entire
		 * codepath should strictly reside in the .kprobes.text section.
		 * Raise a BUG or we'll continue in an endless reentering loop
		 * and eventually a stack overflow.
		 */
		pr_err("Unrecoverable kprobe detected.\n");
		dump_kprobe(p);
		BUG();
	default:
		/* impossible cases */
		WARN_ON(1);
		return 0;
	}

	return 1;
}
NOKPROBE_SYMBOL(reenter_kprobe);

static nokprobe_inline int kprobe_is_ss(struct kprobe_ctlblk *kcb)
{
	return (kcb->kprobe_status == KPROBE_HIT_SS ||
		kcb->kprobe_status == KPROBE_REENTER);
}

/*
 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
 * remain disabled throughout this function.
 */
int kprobe_int3_handler(struct pt_regs *regs)
{
	kprobe_opcode_t *addr;
	struct kprobe *p;
	struct kprobe_ctlblk *kcb;

	if (user_mode(regs))
		return 0;

	addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
	/*
	 * We don't want to be preempted for the entire duration of kprobe
	 * processing. Since int3 and the debug trap disable irqs and we clear
	 * IF while single-stepping, this must not be preemptible.
	 */
	kcb = get_kprobe_ctlblk();
	p = get_kprobe(addr);

	if (p) {
		if (kprobe_running()) {
			if (reenter_kprobe(p, regs, kcb))
				return 1;
		} else {
			set_current_kprobe(p, regs, kcb);
			kcb->kprobe_status = KPROBE_HIT_ACTIVE;

			/*
			 * If we have no pre-handler or it returned 0, we
			 * continue with normal processing. If we have a
			 * pre-handler and it returned non-zero, that means
			 * the user handler set up registers to exit to another
			 * instruction, so we must skip the single stepping.
			 */
			if (!p->pre_handler || !p->pre_handler(p, regs))
				setup_singlestep(p, regs, kcb, 0);
			else
				reset_current_kprobe();
			return 1;
		}
	} else if (kprobe_is_ss(kcb)) {
		p = kprobe_running();
		if ((unsigned long)p->ainsn.insn < regs->ip &&
		    (unsigned long)p->ainsn.insn + MAX_INSN_SIZE > regs->ip) {
			/* Most probably this is the second int3 for single-stepping */
			resume_singlestep(p, regs, kcb);
			kprobe_post_process(p, regs, kcb);
			return 1;
		}
	} /* else: not a kprobe fault; let the kernel handle it */

	return 0;
}
NOKPROBE_SYMBOL(kprobe_int3_handler);

int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
{
	struct kprobe *cur = kprobe_running();
	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();

	if (unlikely(regs->ip == (unsigned long)cur->ainsn.insn)) {
		/* This must happen on single-stepping */
		WARN_ON(kcb->kprobe_status != KPROBE_HIT_SS &&
			kcb->kprobe_status != KPROBE_REENTER);
		/*
		 * We are here because the instruction being single-
		 * stepped caused a page fault. We reset the current
		 * kprobe, point ip back to the probe address,
		 * and allow the page fault handler to continue as a
		 * normal page fault.
		 */
		regs->ip = (unsigned long)cur->addr;

		/*
		 * If the IF flag was set before the kprobe hit,
		 * don't touch it:
		 */
		regs->flags |= kcb->kprobe_old_flags;

		if (kcb->kprobe_status == KPROBE_REENTER)
			restore_previous_kprobe(kcb);
		else
			reset_current_kprobe();
	}

	return 0;
}
NOKPROBE_SYMBOL(kprobe_fault_handler);

int __init arch_populate_kprobe_blacklist(void)
{
	return kprobe_add_area_blacklist((unsigned long)__entry_text_start,
					 (unsigned long)__entry_text_end);
}

int __init arch_init_kprobes(void)
{
	return 0;
}

int arch_trampoline_kprobe(struct kprobe *p)
{
	return 0;
}