tsc_sync.c

// SPDX-License-Identifier: GPL-2.0
/*
 * check TSC synchronization.
 *
 * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
 *
 * We check whether all boot CPUs have their TSC's synchronized,
 * print a warning if not and turn off the TSC clock-source.
 *
 * The warp-check is point-to-point between two CPUs, the CPU
 * initiating the bootup is the 'source CPU', the freshly booting
 * CPU is the 'target CPU'.
 *
 * Only two CPUs may participate - they can enter in any order.
 * ( The serial nature of the boot logic and the CPU hotplug lock
 *   protects against more than 2 CPUs entering this code. )
 */
#include <linux/workqueue.h>
#include <linux/topology.h>
#include <linux/spinlock.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/nmi.h>
#include <asm/tsc.h>
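
/*
 * Per-CPU bookkeeping for the TSC_ADJUST MSR:
 *
 * @bootval:   TSC_ADJUST value read when the CPU came up
 * @adjusted:  TSC_ADJUST value the sync code expects this CPU to have
 * @nextcheck: jiffies timestamp used to rate limit the periodic MSR check
 * @warned:    set once a mismatch warning has been printed for this CPU
 */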
struct tsc_adjust {
        s64             bootval;
        s64             adjusted;
        unsigned long   nextcheck;
        bool            warned;
};

static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
static struct timer_list tsc_sync_check_timer;

/*
 * TSC's on different sockets may be reset asynchronously.
 * This may cause the TSC ADJUST value on socket 0 to be NOT 0.
 */
bool __read_mostly tsc_async_resets;
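
/*
 * Record that TSCs may be reset asynchronously across sockets, so a
 * non-zero TSC_ADJUST on socket 0 is acceptable. Presumably called by
 * platform setup code; the flag is sticky and only ever set once.
 */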
void mark_tsc_async_resets(char *reason)
{
        if (tsc_async_resets)
                return;
        tsc_async_resets = true;
        pr_info("tsc: Marking TSC async resets true due to %s\n", reason);
}
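
/*
 * Verify that MSR_IA32_TSC_ADJUST still holds the value this CPU expects
 * and, if something (e.g. firmware) changed it behind the kernel's back,
 * write the expected value back and warn. As the comment above
 * SYNC_CHECK_INTERVAL notes, this is normally invoked on idle entry; the
 * @resume argument presumably marks the resume path, where the rate limit
 * is bypassed and the warning is re-armed.
 */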
void tsc_verify_tsc_adjust(bool resume)
{
        struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust);
        s64 curval;

        if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
                return;

        /* Skip unnecessary error messages if TSC already unstable */
        if (check_tsc_unstable())
                return;

        /* Rate limit the MSR check */
        if (!resume && time_before(jiffies, adj->nextcheck))
                return;

        adj->nextcheck = jiffies + HZ;

        rdmsrl(MSR_IA32_TSC_ADJUST, curval);
        if (adj->adjusted == curval)
                return;

        /* Restore the original value */
        wrmsrl(MSR_IA32_TSC_ADJUST, adj->adjusted);

        if (!adj->warned || resume) {
                pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n",
                        smp_processor_id(), adj->adjusted, curval);
                adj->warned = true;
        }
}

/*
 * Normally the tsc_sync will be checked every time system enters idle
 * state, but there is still a caveat that a system won't enter idle,
 * either because it's too busy or configured purposely to not enter
 * idle.
 *
 * So set up a periodic timer (every 10 minutes) to make sure the check
 * is always on.
 */

#define SYNC_CHECK_INTERVAL             (HZ * 600)
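
/*
 * Timer callback: verify the local TSC_ADJUST value, then re-arm the timer
 * on the next online CPU (wrapping around to the first one), so every
 * online CPU gets checked in turn.
 */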
static void tsc_sync_check_timer_fn(struct timer_list *unused)
{
        int next_cpu;

        tsc_verify_tsc_adjust(false);

        /* Run the check for all onlined CPUs in turn */
        next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
        if (next_cpu >= nr_cpu_ids)
                next_cpu = cpumask_first(cpu_online_mask);

        tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL;
        add_timer_on(&tsc_sync_check_timer, next_cpu);
}
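
/*
 * Start the periodic check only when there is something to verify: skip it
 * when the CPU has no TSC_ADJUST MSR, or when the TSC clocksource has been
 * marked reliable, in which case the extra check is presumably unnecessary.
 */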
static int __init start_sync_check_timer(void)
{
        if (!cpu_feature_enabled(X86_FEATURE_TSC_ADJUST) || tsc_clocksource_reliable)
                return 0;

        timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0);
        tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL;
        add_timer(&tsc_sync_check_timer);

        return 0;
}
late_initcall(start_sync_check_timer);

static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
                                   unsigned int cpu, bool bootcpu)
{
        /*
         * First online CPU in a package stores the boot value in the
         * adjustment value. This value might change later via the sync
         * mechanism. If that fails we still can yell about boot values not
         * being consistent.
         *
         * On the boot cpu we just force set the ADJUST value to 0 if it's
         * non zero. We don't do that on non boot cpus because physical
         * hotplug should have set the ADJUST register to a value > 0 so
         * the TSC is in sync with the already running cpus.
         *
         * Also don't force the ADJUST value to zero if that is a valid value
         * for socket 0 as determined by the system arch. This is required
         * when multiple sockets are reset asynchronously with each other
         * and socket 0 may not have a TSC ADJUST value of 0.
         */
        if (bootcpu && bootval != 0) {
                if (likely(!tsc_async_resets)) {
                        pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n",
                                cpu, bootval);
                        wrmsrl(MSR_IA32_TSC_ADJUST, 0);
                        bootval = 0;
                } else {
                        pr_info("TSC ADJUST: CPU%u: %lld NOT forced to 0\n",
                                cpu, bootval);
                }
        }
        cur->adjusted = bootval;
}

#ifndef CONFIG_SMP
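/*
 * UP variant: there is no second CPU to compare or synchronize against,
 * so only record and sanitize the boot value of the TSC_ADJUST MSR.
 */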
bool __init tsc_store_and_check_tsc_adjust(bool bootcpu)
{
        struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
        s64 bootval;

        if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
                return false;

        /* Skip unnecessary error messages if TSC already unstable */
        if (check_tsc_unstable())
                return false;

        rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
        cur->bootval = bootval;
        cur->nextcheck = jiffies + HZ;
        tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu);
        return false;
}
#else /* !CONFIG_SMP */

/*
 * Store and check the TSC ADJUST MSR if available
 */
bool tsc_store_and_check_tsc_adjust(bool bootcpu)
{
        struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust);
        unsigned int refcpu, cpu = smp_processor_id();
        struct cpumask *mask;
        s64 bootval;

        if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
                return false;

        rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
        cur->bootval = bootval;
        cur->nextcheck = jiffies + HZ;
        cur->warned = false;

        /*
         * The default adjust value cannot be assumed to be zero on any socket.
         */
        cur->adjusted = bootval;

        /*
         * Check whether this CPU is the first in a package to come up. In
         * this case do not check the boot value against another package
         * because the new package might have been physically hotplugged,
         * where TSC_ADJUST is expected to be different. When called on the
         * boot CPU topology_core_cpumask() might not be available yet.
         */
        mask = topology_core_cpumask(cpu);
        refcpu = mask ? cpumask_any_but(mask, cpu) : nr_cpu_ids;

        if (refcpu >= nr_cpu_ids) {
                tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(),
                                       bootcpu);
                return false;
        }

        ref = per_cpu_ptr(&tsc_adjust, refcpu);

        /*
         * Compare the boot value and complain if it differs in the
         * package.
         */
        if (bootval != ref->bootval)
                printk_once(FW_BUG "TSC ADJUST differs within socket(s), fixing all errors\n");

        /*
         * The TSC_ADJUST values in a package must be the same. If the boot
         * value on this newly upcoming CPU differs from the adjustment
         * value of the already online CPU in this package, set it to that
         * adjusted value.
         */
        if (bootval != ref->adjusted) {
                cur->adjusted = ref->adjusted;
                wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted);
        }
        /*
         * We have the TSCs forced to be in sync on this package. Skip sync
         * test:
         */
        return true;
}

/*
 * Entry/exit counters that make sure that both CPUs
 * run the measurement code at once:
 */
static atomic_t start_count;
static atomic_t stop_count;
static atomic_t test_runs;
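
/*
 * How the handshake below works (see check_tsc_sync_source() and
 * check_tsc_sync_target()): both CPUs bump start_count and spin until it
 * reaches 2, run check_tsc_warp() concurrently, then bump stop_count and
 * spin until that reaches 2 as well. The source resets start_count and the
 * warp statistics, the target resets stop_count, so a retry or a later CPU
 * bringup starts from a clean slate.
 *
 * test_runs is the number of measurement attempts left: 1 when there is no
 * TSC_ADJUST MSR to play with, 3 when the target can try to compensate an
 * observed warp by rewriting TSC_ADJUST and re-running the test.
 */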

/*
 * We use a raw spinlock in this exceptional case, because
 * we want to have the fastest, inlined, non-debug version
 * of a critical section, to be able to prove TSC time-warps:
 */
static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
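
/*
 * Shared warp bookkeeping, all protected by sync_lock:
 * last_tsc is the most recent TSC value published by either CPU,
 * max_warp the largest backwards step observed so far, nr_warps the total
 * number of observed warps, and random_warps counts the cases where warps
 * were seen by both CPUs, i.e. time bounced back and forth instead of one
 * TSC simply lagging the other.
 */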
static cycles_t last_tsc;
static cycles_t max_warp;
static int nr_warps;
static int random_warps;

/*
 * TSC-warp measurement loop running on both CPUs. This is not called
 * if there is no TSC.
 */
static cycles_t check_tsc_warp(unsigned int timeout)
{
        cycles_t start, now, prev, end, cur_max_warp = 0;
        int i, cur_warps = 0;

        start = rdtsc_ordered();
        /*
         * The measurement runs for 'timeout' msecs:
         */
        end = start + (cycles_t) tsc_khz * timeout;

        for (i = 0; ; i++) {
                /*
                 * We take the global lock, measure TSC, save the
                 * previous TSC that was measured (possibly on
                 * another CPU) and update the previous TSC timestamp.
                 */
                arch_spin_lock(&sync_lock);
                prev = last_tsc;
                now = rdtsc_ordered();
                last_tsc = now;
                arch_spin_unlock(&sync_lock);

                /*
                 * Be nice every now and then (and also check whether
                 * measurement is done [we also insert a 10 million
                 * loops safety exit, so we don't lock up in case the
                 * TSC readout is totally broken]):
                 */
                if (unlikely(!(i & 7))) {
                        if (now > end || i > 10000000)
                                break;
                        cpu_relax();
                        touch_nmi_watchdog();
                }
                /*
                 * Outside the critical section we can now see whether
                 * we saw a time-warp of the TSC going backwards:
                 */
                if (unlikely(prev > now)) {
                        arch_spin_lock(&sync_lock);
                        max_warp = max(max_warp, prev - now);
                        cur_max_warp = max_warp;

                        /*
                         * Check whether this bounces back and forth. Only
                         * one CPU should observe time going backwards.
                         */
                        if (cur_warps != nr_warps)
                                random_warps++;
                        nr_warps++;
                        cur_warps = nr_warps;
                        arch_spin_unlock(&sync_lock);
                }
        }
        WARN(!(now-start),
             "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
             now-start, end-start);
        return cur_max_warp;
}

/*
 * If the target CPU coming online doesn't have any of its core-siblings
 * online, a timeout of 20msec will be used for the TSC-warp measurement
 * loop. Otherwise a smaller timeout of 2msec will be used, as we have some
 * information about this socket already (and this information grows as we
 * have more and more logical-siblings in that socket).
 *
 * Ideally we should be able to skip the TSC sync check on the other
 * core-siblings, if the first logical CPU in a socket passed the sync test.
 * But as the TSC is per-logical CPU and can potentially be modified wrongly
 * by the BIOS, a TSC sync test of smaller duration should still be able
 * to catch such errors. It also catches the condition where all the
 * cores in the socket don't get reset at the same time.
 */
static inline unsigned int loop_timeout(int cpu)
{
        return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20;
}
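
/*
 * Marking the TSC unstable is deferred to a workqueue here:
 * check_tsc_sync_source() runs in the context of an async SMP function
 * call, where calling mark_tsc_unstable() directly is presumably not safe.
 */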
static void tsc_sync_mark_tsc_unstable(struct work_struct *work)
{
        mark_tsc_unstable("check_tsc_sync_source failed");
}

static DECLARE_WORK(tsc_sync_work, tsc_sync_mark_tsc_unstable);

/*
 * The freshly booted CPU initiates this via an async SMP function call.
 */
static void check_tsc_sync_source(void *__cpu)
{
        unsigned int cpu = (unsigned long)__cpu;
        int cpus = 2;

        /*
         * Set the maximum number of test runs to
         *  1 if the CPU does not provide the TSC_ADJUST MSR
         *  3 if the MSR is available, so the target can try to adjust
         */
        if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
                atomic_set(&test_runs, 1);
        else
                atomic_set(&test_runs, 3);
retry:
        /* Wait for the target to start. */
        while (atomic_read(&start_count) != cpus - 1)
                cpu_relax();

        /*
         * Trigger the target to continue into the measurement too:
         */
        atomic_inc(&start_count);

        check_tsc_warp(loop_timeout(cpu));

        while (atomic_read(&stop_count) != cpus-1)
                cpu_relax();

        /*
         * If the test was successful set the number of runs to zero and
         * stop. If not, decrement the number of runs and check if we can
         * retry. In case of random warps no retry is attempted.
         */
        if (!nr_warps) {
                atomic_set(&test_runs, 0);

                pr_debug("TSC synchronization [CPU#%d -> CPU#%u]: passed\n",
                        smp_processor_id(), cpu);

        } else if (atomic_dec_and_test(&test_runs) || random_warps) {
                /* Force it to 0 if random warps brought us here */
                atomic_set(&test_runs, 0);

                pr_warn("TSC synchronization [CPU#%d -> CPU#%u]:\n",
                        smp_processor_id(), cpu);
                pr_warn("Measured %Ld cycles TSC warp between CPUs, "
                        "turning off TSC clock.\n", max_warp);
                if (random_warps)
                        pr_warn("TSC warped randomly between CPUs\n");
                schedule_work(&tsc_sync_work);
        }

        /*
         * Reset it - just in case we boot another CPU later:
         */
        atomic_set(&start_count, 0);
        random_warps = 0;
        nr_warps = 0;
        max_warp = 0;
        last_tsc = 0;

        /*
         * Let the target continue with the bootup:
         */
        atomic_inc(&stop_count);

        /*
         * Retry, if there is a chance to do so.
         */
        if (atomic_read(&test_runs) > 0)
                goto retry;
}

/*
 * Freshly booted CPUs call into this:
 */
void check_tsc_sync_target(void)
{
        struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
        unsigned int cpu = smp_processor_id();
        cycles_t cur_max_warp, gbl_max_warp;
        int cpus = 2;

        /* Also aborts if there is no TSC. */
        if (unsynchronized_tsc())
                return;

        /*
         * Store, verify and sanitize the TSC adjust register. If
         * successful skip the test.
         *
         * The test is also skipped when the TSC is marked reliable. This
         * is true for SoCs which have no fallback clocksource. On these
         * SoCs the TSC is frequency synchronized, but still the TSC ADJUST
         * register might have been wrecked by the BIOS.
         */
        if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable)
                return;

        /* Kick the control CPU into the TSC synchronization function */
        smp_call_function_single(cpumask_first(cpu_online_mask), check_tsc_sync_source,
                                 (unsigned long *)(unsigned long)cpu, 0);
retry:
        /*
         * Register this CPU's participation and wait for the
         * source CPU to start the measurement:
         */
        atomic_inc(&start_count);
        while (atomic_read(&start_count) != cpus)
                cpu_relax();

        cur_max_warp = check_tsc_warp(loop_timeout(cpu));

        /*
         * Store the maximum observed warp value for a potential retry:
         */
        gbl_max_warp = max_warp;

        /*
         * Ok, we are done:
         */
        atomic_inc(&stop_count);

        /*
         * Wait for the source CPU to print stuff:
         */
        while (atomic_read(&stop_count) != cpus)
                cpu_relax();

        /*
         * Reset it for the next sync test:
         */
        atomic_set(&stop_count, 0);

        /*
         * Check the number of remaining test runs. If not zero, the test
         * failed and a retry with adjusted TSC is possible. If zero the
         * test was either successful or failed terminally.
         */
        if (!atomic_read(&test_runs))
                return;

        /*
         * If the warp value of this CPU is 0, then the other CPU
         * observed time going backwards so this TSC was ahead and
         * needs to move backwards.
         */
        if (!cur_max_warp)
                cur_max_warp = -gbl_max_warp;

        /*
         * Add the result to the previous adjustment value.
         *
         * The adjustment value is slightly off by the overhead of the
         * sync mechanism (observed values are ~200 TSC cycles), but this
         * really depends on CPU, node distance and frequency. So
         * compensating for this is hard to get right. Experiments show
         * that the warp is no longer detectable when the observed warp
         * value is used. In the worst case the adjustment needs to go
         * through a 3rd run for fine tuning.
         */
        cur->adjusted += cur_max_warp;

        pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n",
                cpu, cur_max_warp, cur->adjusted);

        wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted);
        goto retry;
}
#endif /* CONFIG_SMP */