unaligned_access_speed.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2024 Rivos Inc.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/jump_label.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/types.h>
#include <asm/cpufeature.h>
#include <asm/hwprobe.h>

#include "copy-unaligned.h"

#define MISALIGNED_ACCESS_JIFFIES_LG2 1
#define MISALIGNED_BUFFER_SIZE 0x4000
#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)

DEFINE_PER_CPU(long, misaligned_access_speed);

#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
static cpumask_t fast_misaligned_access;

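/*
 * Time a word-wise copy and a byte-wise copy through deliberately misaligned
 * buffers on the current CPU, and record whether its misaligned accesses are
 * fast or slow in misaligned_access_speed.
 */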
static int check_unaligned_access(void *param)
{
	int cpu = smp_processor_id();
	u64 start_cycles, end_cycles;
	u64 word_cycles;
	u64 byte_cycles;
	int ratio;
	unsigned long start_jiffies, now;
	struct page *page = param;
	void *dst;
	void *src;
	long speed = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;

	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
		return 0;

	/* Make an unaligned destination buffer. */
	dst = (void *)((unsigned long)page_address(page) | 0x1);
	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
	src += 2;
	word_cycles = -1ULL;

	/* Do a warmup. */
	__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	preempt_disable();
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	/*
	 * For a fixed amount of time, repeatedly try the function, and take
	 * the best time in cycles as the measurement.
	 */
	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder WRT to the copy. */
		mb();
		__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < word_cycles)
			word_cycles = end_cycles - start_cycles;
	}

	byte_cycles = -1ULL;
	__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		mb();
		__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < byte_cycles)
			byte_cycles = end_cycles - start_cycles;
	}

	preempt_enable();

	/* Don't divide by zero. */
	if (!word_cycles || !byte_cycles) {
		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
			cpu);
		return 0;
	}

	if (word_cycles < byte_cycles)
		speed = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;

	ratio = div_u64((byte_cycles * 100), word_cycles);
	pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
		cpu,
		ratio / 100,
		ratio % 100,
		(speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) ? "fast" : "slow");

	per_cpu(misaligned_access_speed, cpu) = speed;

	/*
	 * Set the value of fast_misaligned_access of a CPU. These operations
	 * are atomic to avoid race conditions.
	 */
	if (speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST)
		cpumask_set_cpu(cpu, &fast_misaligned_access);
	else
		cpumask_clear_cpu(cpu, &fast_misaligned_access);

	return 0;
}

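/*
 * Run from on_each_cpu(); every CPU except CPU 0 measures itself here.
 * CPU 0 stays behind to tend jiffies and is measured separately.
 */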
static void check_unaligned_access_nonboot_cpu(void *param)
{
	unsigned int cpu = smp_processor_id();
	struct page **pages = param;

	if (smp_processor_id() != 0)
		check_unaligned_access(pages[cpu]);
}

DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);

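/*
 * Flip the fast-unaligned-access static key: enable it only when all @weight
 * CPUs of interest are present in @mask, i.e. every one of them probed fast.
 */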
static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
{
	if (cpumask_weight(mask) == weight)
		static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key);
	else
		static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key);
}

static void set_unaligned_access_static_branches_except_cpu(int cpu)
{
	/*
	 * Same as set_unaligned_access_static_branches, except excludes the
	 * given CPU from the result. When a CPU is hotplugged into an offline
	 * state, this function is called before the CPU is set to offline in
	 * the cpumask, and thus the CPU needs to be explicitly excluded.
	 */
	cpumask_t fast_except_me;

	cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
	cpumask_clear_cpu(cpu, &fast_except_me);

	modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
}

static void set_unaligned_access_static_branches(void)
{
	/*
	 * This will be called after check_unaligned_access_all_cpus so the
	 * result of unaligned access speed for all CPUs will be available.
	 *
	 * To avoid the number of online cpus changing between reading
	 * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
	 * held before calling this function.
	 */
	cpumask_t fast_and_online;

	cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);

	modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
}

static int lock_and_set_unaligned_access_static_branch(void)
{
	cpus_read_lock();
	set_unaligned_access_static_branches();
	cpus_read_unlock();

	return 0;
}

arch_initcall_sync(lock_and_set_unaligned_access_static_branch);

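/*
 * Hotplug callback: measure a newly onlined CPU (unless its speed is already
 * known) and recompute the static branch for the new set of online CPUs.
 */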
static int riscv_online_cpu(unsigned int cpu)
{
	static struct page *buf;

	/* We are already set since the last check */
	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
		goto exit;

	check_unaligned_access_emulated(NULL);
	buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
	if (!buf) {
		pr_warn("Allocation failure, not measuring misaligned performance\n");
		return -ENOMEM;
	}

	check_unaligned_access(buf);
	__free_pages(buf, MISALIGNED_BUFFER_ORDER);

exit:
	set_unaligned_access_static_branches();

	return 0;
}

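/* Hotplug callback: recompute the static branch without the departing CPU. */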
static int riscv_offline_cpu(unsigned int cpu)
{
	set_unaligned_access_static_branches_except_cpu(cpu);

	return 0;
}

/* Measure unaligned access speed on all CPUs present at boot in parallel. */
static int check_unaligned_access_speed_all_cpus(void)
{
	unsigned int cpu;
	unsigned int cpu_count = num_possible_cpus();
	struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL);

	if (!bufs) {
		pr_warn("Allocation failure, not measuring misaligned performance\n");
		return 0;
	}

	/*
	 * Allocate separate buffers for each CPU so there's no fighting over
	 * cache lines.
	 */
	for_each_cpu(cpu, cpu_online_mask) {
		bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
		if (!bufs[cpu]) {
			pr_warn("Allocation failure, not measuring misaligned performance\n");
			goto out;
		}
	}

	/* Check everybody except 0, who stays behind to tend jiffies. */
	on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);

	/* Check core 0. */
	smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);

	/*
	 * Setup hotplug callbacks for any new CPUs that come online or go
	 * offline.
	 */
	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
				  riscv_online_cpu, riscv_offline_cpu);

out:
	for_each_cpu(cpu, cpu_online_mask) {
		if (bufs[cpu])
			__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
	}

	kfree(bufs);
	return 0;
}

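/* Skip the timing probe when misaligned accesses are emulated on every CPU. */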
static int check_unaligned_access_all_cpus(void)
{
	bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus();

	if (!all_cpus_emulated)
		return check_unaligned_access_speed_all_cpus();

	return 0;
}
#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
static int check_unaligned_access_all_cpus(void)
{
	check_unaligned_access_emulated_all_cpus();

	return 0;
}
#endif

arch_initcall(check_unaligned_access_all_cpus);