watchdog_buddy.c

// SPDX-License-Identifier: GPL-2.0

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/nmi.h>
#include <linux/percpu-defs.h>
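
/*
 * CPUs currently participating in buddy hardlockup checking. Each CPU in
 * this mask monitors the next CPU in the mask (wrapping around), using its
 * own hrtimer interrupt to notice that the buddy's interrupts have stalled.
 */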
static cpumask_t __read_mostly watchdog_cpus;
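
/*
 * watchdog_next_cpu - find the CPU that @cpu should monitor
 *
 * Returns the next CPU in watchdog_cpus after @cpu, wrapping around to the
 * first one, or nr_cpu_ids if @cpu is the only CPU in the mask (i.e. there
 * is no buddy to check).
 */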
static unsigned int watchdog_next_cpu(unsigned int cpu)
{
	unsigned int next_cpu;

	next_cpu = cpumask_next(cpu, &watchdog_cpus);
	if (next_cpu >= nr_cpu_ids)
		next_cpu = cpumask_first(&watchdog_cpus);

	if (next_cpu == cpu)
		return nr_cpu_ids;

	return next_cpu;
}
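
/*
 * The buddy detector piggybacks on the hrtimer interrupts the other CPUs
 * already take, so there is no hardware (e.g. a PMU counter) to probe for;
 * report success unconditionally.
 */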
int __init watchdog_hardlockup_probe(void)
{
	return 0;
}
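
/*
 * Called when hardlockup checking is enabled on @cpu (typically as the CPU
 * comes online). Adds @cpu to watchdog_cpus so that the previous CPU in the
 * mask starts checking it.
 */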
void watchdog_hardlockup_enable(unsigned int cpu)
{
	unsigned int next_cpu;

	/*
	 * The new CPU will be marked online before the hrtimer interrupt
	 * gets a chance to run on it. If another CPU tests for a
	 * hardlockup on the new CPU before it has run its hrtimer
	 * interrupt, it will get a false positive. Touch the watchdog on
	 * the new CPU to delay the check for at least 3 sampling periods
	 * to guarantee one hrtimer has run on the new CPU.
	 */
	watchdog_hardlockup_touch_cpu(cpu);

	/*
	 * We are going to check the next CPU. Our watchdog_hrtimer
	 * need not be zero if the CPU has already been online earlier.
	 * Touch the watchdog on the next CPU to avoid a false positive
	 * if we try to check it in less than 3 interrupts.
	 */
	next_cpu = watchdog_next_cpu(cpu);
	if (next_cpu < nr_cpu_ids)
		watchdog_hardlockup_touch_cpu(next_cpu);

	/*
	 * Make sure that the watchdog is touched on this CPU before
	 * other CPUs can see it in watchdog_cpus. The counterpart is in
	 * watchdog_buddy_check_hardlockup().
	 */
	smp_wmb();

	cpumask_set_cpu(cpu, &watchdog_cpus);
}
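
/*
 * Counterpart of watchdog_hardlockup_enable(): called when hardlockup
 * checking is disabled on @cpu (typically as the CPU goes offline). Removes
 * @cpu from watchdog_cpus so it is neither checked nor checking anyone.
 */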
void watchdog_hardlockup_disable(unsigned int cpu)
{
	unsigned int next_cpu = watchdog_next_cpu(cpu);

	/*
	 * Offlining this CPU will cause the CPU before this one to start
	 * checking the one after this one. If this CPU just finished checking
	 * the next CPU and updating hrtimer_interrupts_saved, and then the
	 * previous CPU checks it within one sample period, it will trigger a
	 * false positive. Touch the watchdog on the next CPU to prevent it.
	 */
	if (next_cpu < nr_cpu_ids)
		watchdog_hardlockup_touch_cpu(next_cpu);

	/*
	 * Make sure that the watchdog is touched on the next CPU before
	 * this CPU disappears from watchdog_cpus. The counterpart is in
	 * watchdog_buddy_check_hardlockup().
	 */
	smp_wmb();

	cpumask_clear_cpu(cpu, &watchdog_cpus);
}
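
/*
 * Called from each CPU's watchdog hrtimer with that CPU's current
 * hrtimer-interrupt count. Every third sample, check whether this CPU's
 * buddy (the next CPU in watchdog_cpus) is still taking hrtimer interrupts.
 * With the default watchdog_thresh of 10s the sample period is 4s, so the
 * buddy is checked roughly every 12s.
 */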
void watchdog_buddy_check_hardlockup(int hrtimer_interrupts)
{
	unsigned int next_cpu;

	/*
	 * Test for hardlockups every 3 samples. The sample period is
	 * watchdog_thresh * 2 / 5, so 3 samples get us back to slightly over
	 * watchdog_thresh (over by 20%).
	 */
	if (hrtimer_interrupts % 3 != 0)
		return;

	/* check for a hardlockup on the next CPU */
	next_cpu = watchdog_next_cpu(smp_processor_id());
	if (next_cpu >= nr_cpu_ids)
		return;

	/*
	 * Make sure that the watchdog was touched on the next CPU when
	 * watchdog_next_cpu() returned a different CPU because of
	 * a change in watchdog_hardlockup_enable()/disable().
	 */
	smp_rmb();

	watchdog_hardlockup_check(next_cpu, NULL);
}