kmmio.c

// SPDX-License-Identifier: GPL-2.0
/* Support for MMIO probes.
 * Borrows heavily from the kprobes code.
 * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
 *     2007 Alexander Eichner
 *     2008 Pekka Paalanen <pq@iki.fi>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/hash.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/ptrace.h>
#include <linux/preempt.h>
#include <linux/percpu.h>
#include <linux/kdebug.h>
#include <linux/mutex.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <linux/errno.h>
#include <asm/debugreg.h>
#include <linux/mmiotrace.h>

#define KMMIO_PAGE_HASH_BITS 4
#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
struct kmmio_fault_page {
        struct list_head list;
        struct kmmio_fault_page *release_next;
        unsigned long addr; /* the requested address */
        pteval_t old_presence; /* page presence prior to arming */
        bool armed;

        /*
         * Number of times this page has been registered as a part
         * of a probe. If zero, page is disarmed and this may be freed.
         * Used only by writers (RCU) and post_kmmio_handler().
         * Protected by kmmio_lock, when linked into kmmio_page_table.
         */
        int count;

        bool scheduled_for_release;
};

struct kmmio_delayed_release {
        struct rcu_head rcu;
        struct kmmio_fault_page *release_list;
};

struct kmmio_context {
        struct kmmio_fault_page *fpage;
        struct kmmio_probe *probe;
        unsigned long saved_flags;
        unsigned long addr;
        int active;
};
/*
 * The kmmio_lock is taken in int3 context, which is treated as NMI context.
 * This causes lockdep to complain about it being in both NMI and normal
 * context. Hide it from lockdep, as it should not have any other locks
 * taken under it, and this is only enabled for debugging mmio anyway.
 */
static arch_spinlock_t kmmio_lock = __ARCH_SPIN_LOCK_UNLOCKED;

/* Protected by kmmio_lock */
unsigned int kmmio_count;

/* Read-protected by RCU, write-protected by kmmio_lock. */
static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
static LIST_HEAD(kmmio_probes);

static struct list_head *kmmio_page_list(unsigned long addr)
{
        unsigned int l;
        pte_t *pte = lookup_address(addr, &l);

        if (!pte)
                return NULL;
        addr &= page_level_mask(l);

        return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)];
}

/* Accessed per-cpu */
static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
/*
 * this is basically a dynamic stabbing problem:
 * Could use the existing prio tree code or
 * Possible better implementations:
 * The Interval Skip List: A Data Structure for Finding All Intervals That
 * Overlap a Point (might be simple)
 * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
 */
/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
{
        struct kmmio_probe *p;

        list_for_each_entry_rcu(p, &kmmio_probes, list) {
                if (addr >= p->addr && addr < (p->addr + p->len))
                        return p;
        }
        return NULL;
}
/* You must be holding RCU read lock. */
static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr)
{
        struct list_head *head;
        struct kmmio_fault_page *f;
        unsigned int l;
        pte_t *pte = lookup_address(addr, &l);

        if (!pte)
                return NULL;
        addr &= page_level_mask(l);
        head = kmmio_page_list(addr);

        list_for_each_entry_rcu(f, head, list) {
                if (f->addr == addr)
                        return f;
        }
        return NULL;
}
static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
{
        pmd_t new_pmd;
        pmdval_t v = pmd_val(*pmd);

        if (clear) {
                *old = v;
                new_pmd = pmd_mkinvalid(*pmd);
        } else {
                /* Presume this has been called with clear==true previously */
                new_pmd = __pmd(*old);
        }
        set_pmd(pmd, new_pmd);
}

static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
{
        pteval_t v = pte_val(*pte);

        if (clear) {
                *old = v;
                /* Nothing should care about address */
                pte_clear(&init_mm, 0, pte);
        } else {
                /* Presume this has been called with clear==true previously */
                set_pte_atomic(pte, __pte(*old));
        }
}

static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
{
        unsigned int level;
        pte_t *pte = lookup_address(f->addr, &level);

        if (!pte) {
                pr_err("no pte for addr 0x%08lx\n", f->addr);
                return -1;
        }

        switch (level) {
        case PG_LEVEL_2M:
                clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
                break;
        case PG_LEVEL_4K:
                clear_pte_presence(pte, clear, &f->old_presence);
                break;
        default:
                pr_err("unexpected page level 0x%x.\n", level);
                return -1;
        }

        flush_tlb_one_kernel(f->addr);
        return 0;
}
/*
 * Mark the given page as not present. Access to it will trigger a fault.
 *
 * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
 * protection is ignored here. RCU read lock is assumed held, so the struct
 * will not disappear unexpectedly. Furthermore, the caller must guarantee
 * that double arming the same virtual address (page) cannot occur.
 *
 * Double disarming on the other hand is allowed, and may occur when a fault
 * and mmiotrace shutdown happen simultaneously.
 */
static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
{
        int ret;

        WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
        if (f->armed) {
                pr_warn("double-arm: addr 0x%08lx, ref %d, old %d\n",
                        f->addr, f->count, !!f->old_presence);
        }
        ret = clear_page_presence(f, true);
        WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"),
                  f->addr);
        f->armed = true;
        return ret;
}

/** Restore the given page to saved presence state. */
static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
{
        int ret = clear_page_presence(f, false);

        WARN_ONCE(ret < 0,
                  KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr);
        f->armed = false;
}
/*
 * This is being called from do_page_fault().
 *
 * We may be in an interrupt or a critical section. Also prefetching may
 * trigger a page fault. We may be in the middle of a process switch.
 * We cannot take any locks, because we could already be executing within
 * a kmmio critical section.
 *
 * Local interrupts are disabled, so preemption cannot happen.
 * Do not enable interrupts, do not sleep, and watch out for other CPUs.
 */
/*
 * Interrupts are disabled on entry as trap3 is an interrupt gate
 * and they remain disabled throughout this function.
 */
int kmmio_handler(struct pt_regs *regs, unsigned long addr)
{
        struct kmmio_context *ctx;
        struct kmmio_fault_page *faultpage;
        int ret = 0; /* default to fault not handled */
        unsigned long page_base = addr;
        unsigned int l;
        pte_t *pte = lookup_address(addr, &l);

        if (!pte)
                return -EINVAL;
        page_base &= page_level_mask(l);

        /*
         * Hold the RCU read lock over single stepping to avoid looking
         * up the probe and kmmio_fault_page again. The rcu_read_lock_sched()
         * also disables preemption and prevents process switch during
         * the single stepping. We can only handle one active kmmio trace
         * per cpu, so ensure that we finish it before something else
         * gets to run.
         */
        rcu_read_lock_sched_notrace();

        faultpage = get_kmmio_fault_page(page_base);
        if (!faultpage) {
                /*
                 * Either this page fault is not caused by kmmio, or
                 * another CPU just pulled the kmmio probe from under
                 * our feet. The latter case should not be possible.
                 */
                goto no_kmmio;
        }

        ctx = this_cpu_ptr(&kmmio_ctx);
        if (ctx->active) {
                if (page_base == ctx->addr) {
                        /*
                         * A second fault on the same page means some other
                         * condition needs handling by do_page_fault(), the
                         * page really not being present is the most common.
                         */
                        pr_debug("secondary hit for 0x%08lx CPU %d.\n",
                                 addr, smp_processor_id());

                        if (!faultpage->old_presence)
                                pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n",
                                        addr, smp_processor_id());
                } else {
                        /*
                         * Prevent overwriting already in-flight context.
                         * This should not happen, let's hope disarming at
                         * least prevents a panic.
                         */
                        pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n",
                                 smp_processor_id(), addr);
                        pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
                        disarm_kmmio_fault_page(faultpage);
                }
                goto no_kmmio;
        }
        ctx->active++;

        ctx->fpage = faultpage;
        ctx->probe = get_kmmio_probe(page_base);
        ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
        ctx->addr = page_base;

        if (ctx->probe && ctx->probe->pre_handler)
                ctx->probe->pre_handler(ctx->probe, regs, addr);

        /*
         * Enable single-stepping and disable interrupts for the faulting
         * context. Local interrupts must not get enabled during stepping.
         */
        regs->flags |= X86_EFLAGS_TF;
        regs->flags &= ~X86_EFLAGS_IF;

        /* Now we set present bit in PTE and single step. */
        disarm_kmmio_fault_page(ctx->fpage);

        /*
         * If another cpu accesses the same page while we are stepping,
         * the access will not be caught. It will simply succeed and the
         * only downside is we lose the event. If this becomes a problem,
         * the user should drop to single cpu before tracing.
         */

        return 1; /* fault handled */

no_kmmio:
        rcu_read_unlock_sched_notrace();
        return ret;
}
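
/*
 * Editor's sketch (not part of this file): how the x86 page-fault path is
 * expected to hand faults to kmmio_handler() when mmiotrace has armed pages
 * (kmmio_count != 0, checked via is_kmmio_active()). The wrapper below is
 * illustrative; a return value of 1 from kmmio_handler() means the fault
 * was consumed here and do_page_fault() should not process it further.
 *
 *      static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
 *      {
 *              if (unlikely(is_kmmio_active()))
 *                      if (kmmio_handler(regs, addr) == 1)
 *                              return -1;      // handled, skip normal fault path
 *              return 0;
 *      }
 */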
/*
 * Interrupts are disabled on entry as trap1 is an interrupt gate
 * and they remain disabled throughout this function.
 * This must always get called as the pair to kmmio_handler().
 */
static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
{
        int ret = 0;
        struct kmmio_context *ctx = this_cpu_ptr(&kmmio_ctx);

        if (!ctx->active) {
                /*
                 * debug traps without an active context are due to either
                 * something external causing them (f.e. using a debugger while
                 * mmio tracing enabled), or erroneous behaviour
                 */
                pr_warn("unexpected debug trap on CPU %d.\n", smp_processor_id());
                goto out;
        }

        if (ctx->probe && ctx->probe->post_handler)
                ctx->probe->post_handler(ctx->probe, condition, regs);

        /* Prevent racing against release_kmmio_fault_page(). */
        arch_spin_lock(&kmmio_lock);
        if (ctx->fpage->count)
                arm_kmmio_fault_page(ctx->fpage);
        arch_spin_unlock(&kmmio_lock);

        regs->flags &= ~X86_EFLAGS_TF;
        regs->flags |= ctx->saved_flags;

        /* These were acquired in kmmio_handler(). */
        ctx->active--;
        BUG_ON(ctx->active);
        rcu_read_unlock_sched_notrace();

        /*
         * if somebody else is singlestepping across a probe point, flags
         * will have TF set, in which case, continue the remaining processing
         * of do_debug, as if this is not a probe hit.
         */
        if (!(regs->flags & X86_EFLAGS_TF))
                ret = 1;
out:
        return ret;
}
/* You must be holding kmmio_lock. */
static int add_kmmio_fault_page(unsigned long addr)
{
        struct kmmio_fault_page *f;

        f = get_kmmio_fault_page(addr);
        if (f) {
                if (!f->count)
                        arm_kmmio_fault_page(f);
                f->count++;
                return 0;
        }

        f = kzalloc(sizeof(*f), GFP_ATOMIC);
        if (!f)
                return -1;

        f->count = 1;
        f->addr = addr;

        if (arm_kmmio_fault_page(f)) {
                kfree(f);
                return -1;
        }

        list_add_rcu(&f->list, kmmio_page_list(f->addr));

        return 0;
}
/* You must be holding kmmio_lock. */
static void release_kmmio_fault_page(unsigned long addr,
                                     struct kmmio_fault_page **release_list)
{
        struct kmmio_fault_page *f;

        f = get_kmmio_fault_page(addr);
        if (!f)
                return;

        f->count--;
        BUG_ON(f->count < 0);
        if (!f->count) {
                disarm_kmmio_fault_page(f);
                if (!f->scheduled_for_release) {
                        f->release_next = *release_list;
                        *release_list = f;
                        f->scheduled_for_release = true;
                }
        }
}
/*
 * With page-unaligned ioremaps, one or two armed pages may contain
 * addresses from outside the intended mapping. Events for these addresses
 * are currently silently dropped. The events may result only from programming
 * mistakes by accessing addresses before the beginning or past the end of a
 * mapping.
 */
int register_kmmio_probe(struct kmmio_probe *p)
{
        unsigned long flags;
        int ret = 0;
        unsigned long size = 0;
        unsigned long addr = p->addr & PAGE_MASK;
        const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
        unsigned int l;
        pte_t *pte;

        local_irq_save(flags);
        arch_spin_lock(&kmmio_lock);
        if (get_kmmio_probe(addr)) {
                ret = -EEXIST;
                goto out;
        }

        pte = lookup_address(addr, &l);
        if (!pte) {
                ret = -EINVAL;
                goto out;
        }

        kmmio_count++;
        list_add_rcu(&p->list, &kmmio_probes);
        while (size < size_lim) {
                if (add_kmmio_fault_page(addr + size))
                        pr_err("Unable to set page fault.\n");
                size += page_level_size(l);
        }
out:
        arch_spin_unlock(&kmmio_lock);
        local_irq_restore(flags);

        /*
         * XXX: What should I do here?
         * Here was a call to global_flush_tlb(), but it does not exist
         * anymore. It seems it's not needed after all.
         */
        return ret;
}
EXPORT_SYMBOL(register_kmmio_probe);
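
/*
 * Editor's sketch (not part of this file): a minimal, hypothetical caller of
 * register_kmmio_probe(). The struct kmmio_probe fields and handler
 * signatures come from <linux/mmiotrace.h>; my_pre(), my_post(), iomem and
 * len are made up for illustration only.
 *
 *      static void my_pre(struct kmmio_probe *p, struct pt_regs *regs,
 *                         unsigned long addr)
 *      {
 *              pr_info("about to access 0x%lx\n", addr);
 *      }
 *
 *      static void my_post(struct kmmio_probe *p, unsigned long condition,
 *                          struct pt_regs *regs)
 *      {
 *              pr_info("access completed\n");
 *      }
 *
 *      static struct kmmio_probe my_probe = {
 *              .pre_handler  = my_pre,
 *              .post_handler = my_post,
 *      };
 *
 *      // after iomem = ioremap(phys, len):
 *      my_probe.addr = (unsigned long)iomem;
 *      my_probe.len  = len;
 *      if (register_kmmio_probe(&my_probe))
 *              pr_err("could not register kmmio probe\n");
 */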
static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
{
        struct kmmio_delayed_release *dr = container_of(
                                                head,
                                                struct kmmio_delayed_release,
                                                rcu);
        struct kmmio_fault_page *f = dr->release_list;

        while (f) {
                struct kmmio_fault_page *next = f->release_next;

                BUG_ON(f->count);
                kfree(f);
                f = next;
        }
        kfree(dr);
}

static void remove_kmmio_fault_pages(struct rcu_head *head)
{
        struct kmmio_delayed_release *dr =
                container_of(head, struct kmmio_delayed_release, rcu);
        struct kmmio_fault_page *f = dr->release_list;
        struct kmmio_fault_page **prevp = &dr->release_list;
        unsigned long flags;

        local_irq_save(flags);
        arch_spin_lock(&kmmio_lock);
        while (f) {
                if (!f->count) {
                        list_del_rcu(&f->list);
                        prevp = &f->release_next;
                } else {
                        *prevp = f->release_next;
                        f->release_next = NULL;
                        f->scheduled_for_release = false;
                }
                f = *prevp;
        }
        arch_spin_unlock(&kmmio_lock);
        local_irq_restore(flags);

        /* This is the real RCU destroy call. */
        call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
}
/*
 * Remove a kmmio probe. You have to synchronize_rcu() before you can be
 * sure that the callbacks will not be called anymore. Only after that
 * you may actually release your struct kmmio_probe.
 *
 * Unregistering a kmmio fault page has three steps:
 * 1. release_kmmio_fault_page()
 *    Disarm the page, wait a grace period to let all faults finish.
 * 2. remove_kmmio_fault_pages()
 *    Remove the pages from kmmio_page_table.
 * 3. rcu_free_kmmio_fault_pages()
 *    Actually free the kmmio_fault_page structs, after another RCU grace
 *    period.
 */
void unregister_kmmio_probe(struct kmmio_probe *p)
{
        unsigned long flags;
        unsigned long size = 0;
        unsigned long addr = p->addr & PAGE_MASK;
        const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
        struct kmmio_fault_page *release_list = NULL;
        struct kmmio_delayed_release *drelease;
        unsigned int l;
        pte_t *pte;

        pte = lookup_address(addr, &l);
        if (!pte)
                return;

        local_irq_save(flags);
        arch_spin_lock(&kmmio_lock);
        while (size < size_lim) {
                release_kmmio_fault_page(addr + size, &release_list);
                size += page_level_size(l);
        }
        list_del_rcu(&p->list);
        kmmio_count--;
        arch_spin_unlock(&kmmio_lock);
        local_irq_restore(flags);

        if (!release_list)
                return;

        drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
        if (!drelease) {
                pr_crit("leaking kmmio_fault_page objects.\n");
                return;
        }
        drelease->release_list = release_list;

        /*
         * This is not really RCU here. We have just disarmed a set of
         * pages so that they cannot trigger page faults anymore. However,
         * we cannot remove the pages from kmmio_page_table,
         * because a probe hit might be in flight on another CPU. The
         * pages are collected into a list, and they will be removed from
         * kmmio_page_table when it is certain that no probe hit related to
         * these pages can be in flight. RCU grace period sounds like a
         * good choice.
         *
         * If we removed the pages too early, kmmio page fault handler might
         * not find the respective kmmio_fault_page and determine it's not
         * a kmmio fault, when it actually is. This would lead to madness.
         */
        call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
}
EXPORT_SYMBOL(unregister_kmmio_probe);
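
/*
 * Editor's sketch (not part of this file): tearing down the hypothetical
 * probe from the registration example above. As the comment before
 * unregister_kmmio_probe() states, the probe struct may only be reused or
 * freed after an RCU grace period has passed.
 *
 *      unregister_kmmio_probe(&my_probe);
 *      synchronize_rcu();      // wait until no handler can still use it
 *      // now my_probe may be freed or reused, and the region iounmap'd
 */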
static int
kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
{
        struct die_args *arg = args;
        unsigned long *dr6_p = (unsigned long *)ERR_PTR(arg->err);

        if (val == DIE_DEBUG && (*dr6_p & DR_STEP))
                if (post_kmmio_handler(*dr6_p, arg->regs) == 1) {
                        /*
                         * Reset the BS bit in dr6 (pointed by args->err) to
                         * denote completion of processing
                         */
                        *dr6_p &= ~DR_STEP;
                        return NOTIFY_STOP;
                }

        return NOTIFY_DONE;
}

static struct notifier_block nb_die = {
        .notifier_call = kmmio_die_notifier
};
int kmmio_init(void)
{
        int i;

        for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
                INIT_LIST_HEAD(&kmmio_page_table[i]);

        return register_die_notifier(&nb_die);
}

void kmmio_cleanup(void)
{
        int i;

        unregister_die_notifier(&nb_die);
        for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
                WARN_ONCE(!list_empty(&kmmio_page_table[i]),
                        KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n");
        }
}
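
/*
 * Editor's sketch (not part of this file): the expected life cycle of this
 * subsystem as seen from its user, in rough call order. Only kmmio_init(),
 * kmmio_cleanup() and the register/unregister calls above are real; the
 * ordering notes are illustrative.
 *
 *      kmmio_init();                    // hash table + die notifier ready
 *      register_kmmio_probe(&probe);    // arm pages, start tracing
 *      ...                              // faults stream events to handlers
 *      unregister_kmmio_probe(&probe);
 *      synchronize_rcu();               // let in-flight hits drain
 *      kmmio_cleanup();                 // page table must be empty by now
 */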