// SPDX-License-Identifier: GPL-2.0
/*
 * Watchdog support on powerpc systems.
 *
 * Copyright 2017, IBM Corporation.
 *
 * This uses code from arch/sparc/kernel/nmi.c and kernel/watchdog.c
 */

#define pr_fmt(fmt) "watchdog: " fmt

#include <linux/kernel.h>
#include <linux/param.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/export.h>
#include <linux/kprobes.h>
#include <linux/hardirq.h>
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/sched/debug.h>
#include <linux/delay.h>
#include <linux/smp.h>

#include <asm/paca.h>

/*
 * The powerpc watchdog ensures that each CPU is able to service timers.
 * The watchdog sets up a simple timer on each CPU to run once per timer
 * period, and updates a per-cpu timestamp and a "pending" cpumask. This is
 * the heartbeat.
 *
 * Then there are two systems to check that the heartbeat is still running.
 * The local soft-NMI, and the SMP checker.
 *
 * The soft-NMI checker can detect lockups on the local CPU. When interrupts
 * are disabled with local_irq_disable(), platforms that use soft-masking
 * can leave hardware interrupts enabled and handle them with a masked
 * interrupt handler. The masked handler can send the timer interrupt to the
 * watchdog's soft_nmi_interrupt(), which appears to Linux as an NMI
 * interrupt, and can be used to detect CPUs stuck with IRQs disabled.
 *
 * The soft-NMI checker will compare the heartbeat timestamp for this CPU
 * with the current time, and take action if the difference exceeds the
 * watchdog threshold.
 *
 * The limitation of the soft-NMI watchdog is that it does not work when
 * interrupts are hard disabled or otherwise not being serviced. This is
 * solved by also having an SMP watchdog where all CPUs check all other
 * CPUs' heartbeats.
 *
 * The SMP checker can detect lockups on other CPUs. A global "pending"
 * cpumask is kept, containing all CPUs which enable the watchdog. Each
 * CPU clears its pending bit in its heartbeat timer. When the bitmask
 * becomes empty, the last CPU to clear its pending bit updates a global
 * timestamp and refills the pending bitmask.
 *
 * In the heartbeat timer, if any CPU notices that the global timestamp has
 * not been updated for a period exceeding the watchdog threshold, then it
 * means the CPU(s) with their bit still set in the pending mask have had
 * their heartbeats stop, and action is taken.
 *
 * Some platforms implement true NMI IPIs, which can be used by the SMP
 * watchdog to detect an unresponsive CPU and pull it out of its stuck
 * state with the NMI IPI, to get crash/debug data from it. This way the
 * SMP watchdog can detect lockups even when hardware interrupts are off.
 */
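
/*
 * Informal sketch of the SMP checker protocol (pseudocode, not compiled):
 *
 *      heartbeat timer on CPU n:
 *              wd_timer_tb[n] = get_tb();
 *              clear bit n in wd_smp_cpus_pending;
 *              if (wd_smp_cpus_pending became empty) {
 *                      wd_smp_last_reset_tb = get_tb();
 *                      refill wd_smp_cpus_pending from wd_cpus_enabled,
 *                      minus wd_smp_cpus_stuck;
 *              }
 *              if (get_tb() - wd_smp_last_reset_tb >= wd_smp_panic_timeout_tb)
 *                      watchdog_smp_panic();  (CPUs left pending are stuck)
 */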

static cpumask_t wd_cpus_enabled __read_mostly;

static u64 wd_panic_timeout_tb __read_mostly; /* timebase ticks until panic */
static u64 wd_smp_panic_timeout_tb __read_mostly; /* panic other CPUs */

static u64 wd_timer_period_ms __read_mostly; /* interval between heartbeats */

static DEFINE_PER_CPU(struct hrtimer, wd_hrtimer);
static DEFINE_PER_CPU(u64, wd_timer_tb);

/* SMP checker bits */
static unsigned long __wd_smp_lock;
static cpumask_t wd_smp_cpus_pending;
static cpumask_t wd_smp_cpus_stuck;
static u64 wd_smp_last_reset_tb;
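
/*
 * On contention the loop below spins with interrupts restored, so the
 * waiting CPU can keep servicing interrupts (and its own heartbeat),
 * then re-disables them before retrying the lock.
 */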
static inline void wd_smp_lock(unsigned long *flags)
{
        /*
         * Avoid locking layers if possible.
         * This may be called from low level interrupt handlers at some
         * point in future.
         */
        raw_local_irq_save(*flags);
        hard_irq_disable(); /* Make it soft-NMI safe */
        while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock))) {
                raw_local_irq_restore(*flags);
                spin_until_cond(!test_bit(0, &__wd_smp_lock));
                raw_local_irq_save(*flags);
                hard_irq_disable();
        }
}

static inline void wd_smp_unlock(unsigned long *flags)
{
        clear_bit_unlock(0, &__wd_smp_lock);
        raw_local_irq_restore(*flags);
}
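
/*
 * Runs on a stuck CPU, sent by the SMP checker via NMI IPI: report the
 * lockup and dump state so crash/debug data can be collected from it.
 */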
static void wd_lockup_ipi(struct pt_regs *regs)
{
        int cpu = raw_smp_processor_id();
        u64 tb = get_tb();

        pr_emerg("CPU %d Hard LOCKUP\n", cpu);
        pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n",
                 cpu, tb, per_cpu(wd_timer_tb, cpu),
                 tb_to_ns(tb - per_cpu(wd_timer_tb, cpu)) / 1000000);

        print_modules();
        print_irqtrace_events(current);
        if (regs)
                show_regs(regs);
        else
                dump_stack();

        /* Do not panic from here because that can recurse into NMI IPI layer */
}
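
/*
 * Move the given CPUs from the pending mask to the stuck mask. If that
 * empties the pending mask, begin a new observation interval right away.
 * Callers hold wd_smp_lock.
 */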
static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
{
        cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask);
        cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask);
        if (cpumask_empty(&wd_smp_cpus_pending)) {
                wd_smp_last_reset_tb = tb;
                cpumask_andnot(&wd_smp_cpus_pending,
                               &wd_cpus_enabled,
                               &wd_smp_cpus_stuck);
        }
}

static void set_cpu_stuck(int cpu, u64 tb)
{
        set_cpumask_stuck(cpumask_of(cpu), tb);
}
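
/*
 * Called from the heartbeat timer when the global timestamp looks stale:
 * CPUs still in the pending mask have stopped heartbeating. Re-check
 * under the lock, report, and take the stuck CPUs out of the watch group.
 */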
static void watchdog_smp_panic(int cpu, u64 tb)
{
        unsigned long flags;
        int c;

        wd_smp_lock(&flags);
        /* Double check some things under lock */
        if ((s64)(tb - wd_smp_last_reset_tb) < (s64)wd_smp_panic_timeout_tb)
                goto out;
        if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
                goto out;
        if (cpumask_weight(&wd_smp_cpus_pending) == 0)
                goto out;

        pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n",
                 cpu, cpumask_pr_args(&wd_smp_cpus_pending));
        pr_emerg("CPU %d TB:%lld, last SMP heartbeat TB:%lld (%lldms ago)\n",
                 cpu, tb, wd_smp_last_reset_tb,
                 tb_to_ns(tb - wd_smp_last_reset_tb) / 1000000);

        if (!sysctl_hardlockup_all_cpu_backtrace) {
                /*
                 * Try to trigger the stuck CPUs, unless we are going to
                 * get a backtrace on all of them anyway.
                 */
                for_each_cpu(c, &wd_smp_cpus_pending) {
                        if (c == cpu)
                                continue;
                        smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
                }
        }

        /* Take the stuck CPUs out of the watch group */
        set_cpumask_stuck(&wd_smp_cpus_pending, tb);

        wd_smp_unlock(&flags);

        printk_safe_flush();
        /*
         * printk_safe_flush() seems to require another print
         * before anything actually goes out to console.
         */
        if (sysctl_hardlockup_all_cpu_backtrace)
                trigger_allbutself_cpu_backtrace();

        if (hardlockup_panic)
                nmi_panic(NULL, "Hard LOCKUP");

        return;

out:
        wd_smp_unlock(&flags);
}
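
/*
 * Fast path: clear this CPU's pending bit without taking wd_smp_lock.
 * The lock is only taken to refill the mask (re-checking emptiness under
 * the lock) or to report a formerly stuck CPU that has recovered.
 */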
static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
{
        if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
                if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) {
                        struct pt_regs *regs = get_irq_regs();
                        unsigned long flags;

                        wd_smp_lock(&flags);

                        pr_emerg("CPU %d became unstuck TB:%lld\n",
                                 cpu, tb);
                        print_irqtrace_events(current);
                        if (regs)
                                show_regs(regs);
                        else
                                dump_stack();

                        cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
                        wd_smp_unlock(&flags);
                }
                return;
        }
        cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
        if (cpumask_empty(&wd_smp_cpus_pending)) {
                unsigned long flags;

                wd_smp_lock(&flags);
                if (cpumask_empty(&wd_smp_cpus_pending)) {
                        wd_smp_last_reset_tb = tb;
                        cpumask_andnot(&wd_smp_cpus_pending,
                                       &wd_cpus_enabled,
                                       &wd_smp_cpus_stuck);
                }
                wd_smp_unlock(&flags);
        }
}
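
/*
 * The per-CPU heartbeat: record the current timebase, clear this CPU's
 * pending bit, then check the other CPUs via the global timestamp.
 */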
static void watchdog_timer_interrupt(int cpu)
{
        u64 tb = get_tb();

        per_cpu(wd_timer_tb, cpu) = tb;

        wd_smp_clear_cpu_pending(cpu, tb);

        if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb)
                watchdog_smp_panic(cpu, tb);
}
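
/*
 * Entered as a soft-NMI from the masked-interrupt path described in the
 * comment at the top of this file; catches this CPU looping with
 * interrupts soft-disabled.
 */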
void soft_nmi_interrupt(struct pt_regs *regs)
{
        unsigned long flags;
        int cpu = raw_smp_processor_id();
        u64 tb;

        if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
                return;

        nmi_enter();

        __this_cpu_inc(irq_stat.soft_nmi_irqs);

        tb = get_tb();
        if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) {
                wd_smp_lock(&flags);
                if (cpumask_test_cpu(cpu, &wd_smp_cpus_stuck)) {
                        wd_smp_unlock(&flags);
                        goto out;
                }
                set_cpu_stuck(cpu, tb);

                pr_emerg("CPU %d self-detected hard LOCKUP @ %pS\n",
                         cpu, (void *)regs->nip);
                pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n",
                         cpu, tb, per_cpu(wd_timer_tb, cpu),
                         tb_to_ns(tb - per_cpu(wd_timer_tb, cpu)) / 1000000);
                print_modules();
                print_irqtrace_events(current);
                show_regs(regs);

                wd_smp_unlock(&flags);

                if (sysctl_hardlockup_all_cpu_backtrace)
                        trigger_allbutself_cpu_backtrace();

                if (hardlockup_panic)
                        nmi_panic(regs, "Hard LOCKUP");
        }
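
        /*
         * The decrementer is a 32-bit signed counter, so this is as far
         * ahead as it can be programmed; reload it so another soft-NMI
         * arrives after the panic timeout if this CPU stays stuck.
         */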
        if (wd_panic_timeout_tb < 0x7fffffff)
                mtspr(SPRN_DEC, wd_panic_timeout_tb);

out:
        nmi_exit();
}

static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
        int cpu = smp_processor_id();

        if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
                return HRTIMER_NORESTART;

        if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
                return HRTIMER_NORESTART;

        watchdog_timer_interrupt(cpu);

        hrtimer_forward_now(hrtimer, ms_to_ktime(wd_timer_period_ms));

        return HRTIMER_RESTART;
}
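
/*
 * Called via touch_nmi_watchdog() by code that legitimately runs for a
 * long time with interrupts disabled: emulates a heartbeat so this CPU
 * is not flagged as stuck. Rate-limited to once per timer period.
 */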
void arch_touch_nmi_watchdog(void)
{
        unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
        int cpu = smp_processor_id();
        u64 tb = get_tb();

        if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
                per_cpu(wd_timer_tb, cpu) = tb;
                wd_smp_clear_cpu_pending(cpu, tb);
        }
}
EXPORT_SYMBOL(arch_touch_nmi_watchdog);

static void start_watchdog(void *arg)
{
        struct hrtimer *hrtimer = this_cpu_ptr(&wd_hrtimer);
        int cpu = smp_processor_id();
        unsigned long flags;

        if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) {
                WARN_ON(1);
                return;
        }

        if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
                return;

        if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
                return;

        wd_smp_lock(&flags);
        cpumask_set_cpu(cpu, &wd_cpus_enabled);
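        /* The first CPU to enable seeds the SMP checker state */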
        if (cpumask_weight(&wd_cpus_enabled) == 1) {
                cpumask_set_cpu(cpu, &wd_smp_cpus_pending);
                wd_smp_last_reset_tb = get_tb();
        }
        wd_smp_unlock(&flags);

        *this_cpu_ptr(&wd_timer_tb) = get_tb();

        hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        hrtimer->function = watchdog_timer_fn;
        hrtimer_start(hrtimer, ms_to_ktime(wd_timer_period_ms),
                      HRTIMER_MODE_REL_PINNED);
}

static int start_watchdog_on_cpu(unsigned int cpu)
{
        return smp_call_function_single(cpu, start_watchdog, NULL, true);
}

static void stop_watchdog(void *arg)
{
        struct hrtimer *hrtimer = this_cpu_ptr(&wd_hrtimer);
        int cpu = smp_processor_id();
        unsigned long flags;

        if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
                return; /* Can happen in CPU unplug case */

        hrtimer_cancel(hrtimer);

        wd_smp_lock(&flags);
        cpumask_clear_cpu(cpu, &wd_cpus_enabled);
        wd_smp_unlock(&flags);

        wd_smp_clear_cpu_pending(cpu, get_tb());
}

static int stop_watchdog_on_cpu(unsigned int cpu)
{
        return smp_call_function_single(cpu, stop_watchdog, NULL, true);
}
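
/*
 * Worked example, assuming the default watchdog_thresh of 10 seconds:
 * the local soft-NMI fires after 10s without a heartbeat, the SMP
 * checker after 15s, and the heartbeat timer runs every 4s.
 */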
static void watchdog_calc_timeouts(void)
{
        wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq;

        /* Have the SMP detector trigger a bit later */
        wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2;

        /* 2/5 is the factor that the perf based detector uses */
        wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5;
}

void watchdog_nmi_stop(void)
{
        int cpu;

        for_each_cpu(cpu, &wd_cpus_enabled)
                stop_watchdog_on_cpu(cpu);
}

void watchdog_nmi_start(void)
{
        int cpu;

        watchdog_calc_timeouts();
        for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
                start_watchdog_on_cpu(cpu);
}

/*
 * Invoked from core watchdog init.
 */
int __init watchdog_nmi_probe(void)
{
        int err;

        err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
                                        "powerpc/watchdog:online",
                                        start_watchdog_on_cpu,
                                        stop_watchdog_on_cpu);
        if (err < 0) {
                pr_warn("could not be initialized\n");
                return err;
        }
        return 0;
}