// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2024
 */

#define KMSG_COMPONENT "hd"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
/*
 * Hiperdispatch:
 * Dynamically calculates the optimum number of high capacity COREs
 * by considering the state the system is in. When hiperdispatch decides
 * that a capacity update is necessary, it schedules a topology update.
 * During topology updates the CPU capacities are always re-adjusted.
 *
 * There are two places where CPU capacities are accessed within
 * hiperdispatch:
 * -> hiperdispatch's recurring work function reads CPU capacities to
 *    determine the high capacity CPU count.
 * -> during a topology update hiperdispatch's adjustment function
 *    updates CPU capacities.
 * These two can run on different CPUs in parallel, which can cause
 * hiperdispatch to make wrong decisions. This can potentially cause
 * some overhead by leading to extra rebuild_sched_domains() calls
 * for correction. Access to capacities within hiperdispatch has to be
 * serialized to prevent this overhead.
 *
 * Hiperdispatch decision making revolves around steal time.
 * The HD_STEAL_THRESHOLD value is taken as reference. Whenever steal time
 * crosses the threshold value, hiperdispatch falls back to giving high
 * capacities to entitled CPUs. When steal time drops below the
 * threshold boundary, hiperdispatch utilizes all CPUs by giving all
 * of them high capacity.
 *
 * The theory behind HD_STEAL_THRESHOLD is related to SMP thread
 * performance. Comparing the throughput of:
 * - a single CORE, with N threads, running N tasks
 * - N separate COREs running N tasks,
 * using individual COREs for individual tasks yields better
 * performance. This performance difference is roughly ~30% (can change
 * between machine generations).
 *
 * Hiperdispatch tries to hint the scheduler to use individual COREs for
 * each task, as long as the steal time on those COREs is less than 30%,
 * thereby delaying the throughput loss caused by using SMP threads.
 */
#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/kernel_stat.h>
#include <linux/kstrtox.h>
#include <linux/ktime.h>
#include <linux/sysctl.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <asm/hiperdispatch.h>
#include <asm/setup.h>
#include <asm/smp.h>
#include <asm/topology.h>

#define CREATE_TRACE_POINTS
#include <asm/trace/hiperdispatch.h>

#define HD_DELAY_FACTOR		(4)
#define HD_DELAY_INTERVAL	(HZ / 4)
#define HD_STEAL_THRESHOLD	30
#define HD_STEAL_AVG_WEIGHT	16
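
/*
 * HD_DELAY_INTERVAL is the period of the recurring work (HZ / 4
 * jiffies, i.e. 250 ms); HD_DELAY_FACTOR stretches the first run after
 * a topology update by that factor. HD_STEAL_AVG_WEIGHT is the divisor
 * of the moving average used by hd_steal_avg(): each new sample
 * contributes 1/16 of its value while the running average decays by
 * 15/16 per iteration.
 */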
static cpumask_t hd_vl_coremask; /* Mask containing all vertical low COREs */
static cpumask_t hd_vmvl_cpumask; /* Mask containing vertical medium and low CPUs */
static int hd_high_capacity_cores; /* Current CORE count with high capacity */
static int hd_entitled_cores; /* Total vertical high and medium CORE count */
static int hd_online_cores; /* Current online CORE count */
static unsigned long hd_previous_steal; /* Previous iteration's CPU steal timer total */
static unsigned long hd_high_time; /* Total time spent while all cpus have high capacity */
static unsigned long hd_low_time; /* Total time spent while vl cpus have low capacity */
static atomic64_t hd_adjustments; /* Total occurrence count of hiperdispatch adjustments */

static unsigned int hd_steal_threshold = HD_STEAL_THRESHOLD;
static unsigned int hd_delay_factor = HD_DELAY_FACTOR;
static int hd_enabled;

static void hd_capacity_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn);
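
/*
 * Update hd_enabled; machines without topology support are forced to
 * stay disabled. Returns 1 if the mode actually changed, 0 otherwise,
 * so callers know whether a topology update needs to be scheduled.
 */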
static int hd_set_hiperdispatch_mode(int enable)
{
	if (!MACHINE_HAS_TOPOLOGY)
		enable = 0;
	if (hd_enabled == enable)
		return 0;
	hd_enabled = enable;
	return 1;
}
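
/*
 * Forget all tracked cores. The topology code is expected to call this
 * before re-reporting each core via hd_add_core() during an update.
 */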
void hd_reset_state(void)
{
	cpumask_clear(&hd_vl_coremask);
	cpumask_clear(&hd_vmvl_cpumask);
	hd_entitled_cores = 0;
	hd_online_cores = 0;
}
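
/*
 * Account one online core and classify it by polarization: vertical
 * high and medium cores count as entitled; vertical medium and low
 * CPUs are collected in hd_vmvl_cpumask for steal time sampling, and
 * vertical low cores additionally in hd_vl_coremask for capacity
 * adjustment.
 */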
void hd_add_core(int cpu)
{
	const struct cpumask *siblings;
	int polarization;

	hd_online_cores++;
	polarization = smp_cpu_get_polarization(cpu);
	siblings = topology_sibling_cpumask(cpu);
	switch (polarization) {
	case POLARIZATION_VH:
		hd_entitled_cores++;
		break;
	case POLARIZATION_VM:
		hd_entitled_cores++;
		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
		break;
	case POLARIZATION_VL:
		cpumask_set_cpu(cpu, &hd_vl_coremask);
		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
		break;
	}
}
/* Serialize update and read operations of debug counters. */
static DEFINE_MUTEX(hd_counter_mutex);

static void hd_update_times(void)
{
	static ktime_t prev;
	ktime_t now;

	/*
	 * Check if hiperdispatch is active; if not, reset prev to 0.
	 * This makes it possible to recognize the first update iteration
	 * after hiperdispatch has been enabled.
	 */
	if (hd_entitled_cores == 0 || hd_enabled == 0) {
		prev = ktime_set(0, 0);
		return;
	}
	now = ktime_get();
	if (ktime_after(prev, 0)) {
		if (hd_high_capacity_cores == hd_online_cores)
			hd_high_time += ktime_ms_delta(now, prev);
		else
			hd_low_time += ktime_ms_delta(now, prev);
	}
	prev = now;
}
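
/*
 * Apply the capacity target chosen by the work function: the first
 * (hd_high_capacity_cores - hd_entitled_cores) vertical low cores are
 * raised to CPU_CAPACITY_HIGH, all remaining vertical low cores are
 * set to CPU_CAPACITY_LOW, and hd_high_capacity_cores is recounted to
 * match what was actually applied.
 */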
static void hd_update_capacities(void)
{
	int cpu, upscaling_cores;
	unsigned long capacity;

	upscaling_cores = hd_high_capacity_cores - hd_entitled_cores;
	capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW;
	hd_high_capacity_cores = hd_entitled_cores;
	for_each_cpu(cpu, &hd_vl_coremask) {
		smp_set_core_capacity(cpu, capacity);
		if (capacity != CPU_CAPACITY_HIGH)
			continue;
		hd_high_capacity_cores++;
		upscaling_cores--;
		if (upscaling_cores == 0)
			capacity = CPU_CAPACITY_LOW;
	}
}
void hd_disable_hiperdispatch(void)
{
	cancel_delayed_work_sync(&hd_capacity_work);
	hd_high_capacity_cores = hd_online_cores;
	hd_previous_steal = 0;
}
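
/*
 * Called from the topology update path. Rearms the recurring work with
 * an extended initial delay (HD_DELAY_INTERVAL * hd_delay_factor) so
 * the system can settle after the topology change, but only when
 * hiperdispatch is enabled and there are more online cores than
 * entitled ones; otherwise there is nothing to adjust. Returns 1 if
 * capacities were updated, 0 otherwise.
 */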
int hd_enable_hiperdispatch(void)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	mutex_unlock(&hd_counter_mutex);
	if (hd_enabled == 0)
		return 0;
	if (hd_entitled_cores == 0)
		return 0;
	if (hd_online_cores <= hd_entitled_cores)
		return 0;
	mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor);
	hd_update_capacities();
	return 1;
}
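
/*
 * Exponentially weighted moving average of the steal percentage:
 *
 *	avg = (avg * 15 + new) / 16
 *
 * with the default HD_STEAL_AVG_WEIGHT of 16. A single spike is damped
 * to 1/16 of its size, so the threshold reacts to sustained steal time
 * rather than to momentary bursts.
 */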
static unsigned long hd_steal_avg(unsigned long new)
{
	static unsigned long steal;

	steal = (steal * (HD_STEAL_AVG_WEIGHT - 1) + new) / HD_STEAL_AVG_WEIGHT;
	return steal;
}
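
/*
 * Sample the accumulated steal time of all vertical medium and low
 * CPUs and convert the growth since the previous sample into an
 * average per-CPU percentage of the elapsed wall-clock time. Both
 * CPUTIME_STEAL and the ktime delta are in nanoseconds, so the units
 * cancel out.
 */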
static unsigned long hd_calculate_steal_percentage(void)
{
	unsigned long time_delta, steal_delta, steal, percentage;
	static ktime_t prev;
	int cpus, cpu;
	ktime_t now;

	cpus = 0;
	steal = 0;
	percentage = 0;
	for_each_cpu(cpu, &hd_vmvl_cpumask) {
		steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
		cpus++;
	}
	/*
	 * If there are no vertical medium and low CPUs, steal time
	 * is 0, as vertical high CPUs shouldn't experience steal time.
	 */
	if (cpus == 0)
		return percentage;
	now = ktime_get();
	time_delta = ktime_to_ns(ktime_sub(now, prev));
	if (steal > hd_previous_steal && hd_previous_steal != 0) {
		steal_delta = (steal - hd_previous_steal) * 100 / time_delta;
		percentage = steal_delta / cpus;
	}
	hd_previous_steal = steal;
	prev = now;
	return percentage;
}
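
/*
 * Recurring decision point, executed every HD_DELAY_INTERVAL: while the
 * averaged steal percentage stays below hd_steal_threshold, all online
 * cores are targeted for high capacity (greedy mode); once it crosses
 * the threshold, only the entitled cores keep high capacity
 * (conservative mode). A changed target triggers a topology update,
 * during which the capacities are re-adjusted via hd_update_capacities().
 */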
static void hd_capacity_work_fn(struct work_struct *work)
{
	unsigned long steal_percentage, new_cores;

	mutex_lock(&smp_cpu_state_mutex);
	/*
	 * If the online cores are less than or equal to the entitled
	 * cores, hiperdispatch does not need to make any adjustments;
	 * schedule a topology update to disable hiperdispatch.
	 * Normally this check is handled on topology update, but during
	 * CPU hot-unplug, topology and CPU mask updates are done in
	 * reverse order, causing hd_enable_hiperdispatch() to get stale
	 * data.
	 */
	if (hd_online_cores <= hd_entitled_cores) {
		topology_schedule_update();
		mutex_unlock(&smp_cpu_state_mutex);
		return;
	}
	steal_percentage = hd_steal_avg(hd_calculate_steal_percentage());
	if (steal_percentage < hd_steal_threshold)
		new_cores = hd_online_cores;
	else
		new_cores = hd_entitled_cores;
	if (hd_high_capacity_cores != new_cores) {
		trace_s390_hd_rebuild_domains(hd_high_capacity_cores, new_cores);
		hd_high_capacity_cores = new_cores;
		atomic64_inc(&hd_adjustments);
		topology_schedule_update();
	}
	trace_s390_hd_work_fn(steal_percentage, hd_entitled_cores, hd_high_capacity_cores);
	mutex_unlock(&smp_cpu_state_mutex);
	schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL);
}
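
/*
 * Handler for the s390.hiperdispatch sysctl: reads report hd_enabled,
 * writes accept 0 or 1 (enforced via SYSCTL_ZERO/SYSCTL_ONE) and
 * schedule a topology update whenever the mode actually changes.
 */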
static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	int hiperdispatch;
	int rc;
	struct ctl_table ctl_entry = {
		.procname = ctl->procname,
		.data = &hiperdispatch,
		.maxlen = sizeof(int),
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_ONE,
	};

	hiperdispatch = hd_enabled;
	rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
	if (rc < 0 || !write)
		return rc;
	mutex_lock(&smp_cpu_state_mutex);
	if (hd_set_hiperdispatch_mode(hiperdispatch))
		topology_schedule_update();
	mutex_unlock(&smp_cpu_state_mutex);
	return 0;
}
static struct ctl_table hiperdispatch_ctl_table[] = {
	{
		.procname = "hiperdispatch",
		.mode = 0644,
		.proc_handler = hiperdispatch_ctl_handler,
	},
};
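
/*
 * Sysfs tunables, exposed under /sys/devices/system/cpu/hiperdispatch/:
 * hd_steal_threshold takes a percentage between 0 and 100;
 * hd_delay_factor takes any non-zero factor applied to the initial work
 * delay after a topology update.
 */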
static ssize_t hd_steal_threshold_show(struct device *dev,
				       struct device_attribute *attr,
				       char *buf)
{
	return sysfs_emit(buf, "%u\n", hd_steal_threshold);
}

static ssize_t hd_steal_threshold_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf,
					size_t count)
{
	unsigned int val;
	int rc;

	rc = kstrtouint(buf, 0, &val);
	if (rc)
		return rc;
	if (val > 100)
		return -ERANGE;
	hd_steal_threshold = val;
	return count;
}

static DEVICE_ATTR_RW(hd_steal_threshold);
static ssize_t hd_delay_factor_show(struct device *dev,
				    struct device_attribute *attr,
				    char *buf)
{
	return sysfs_emit(buf, "%u\n", hd_delay_factor);
}

static ssize_t hd_delay_factor_store(struct device *dev,
				     struct device_attribute *attr,
				     const char *buf,
				     size_t count)
{
	unsigned int val;
	int rc;

	rc = kstrtouint(buf, 0, &val);
	if (rc)
		return rc;
	if (!val)
		return -ERANGE;
	hd_delay_factor = val;
	return count;
}

static DEVICE_ATTR_RW(hd_delay_factor);

static struct attribute *hd_attrs[] = {
	&dev_attr_hd_steal_threshold.attr,
	&dev_attr_hd_delay_factor.attr,
	NULL,
};

static const struct attribute_group hd_attr_group = {
	.name = "hiperdispatch",
	.attrs = hd_attrs,
};
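
/*
 * Debugfs counters: greedy_time_ms accumulates milliseconds spent with
 * all cores at high capacity, conservative_time_ms milliseconds spent
 * with vertical low cores throttled, and adjustment_count the number of
 * capacity target changes made by the work function.
 */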
static int hd_greedy_time_get(void *unused, u64 *val)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	*val = hd_high_time;
	mutex_unlock(&hd_counter_mutex);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, hd_greedy_time_get, NULL, "%llu\n");

static int hd_conservative_time_get(void *unused, u64 *val)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	*val = hd_low_time;
	mutex_unlock(&hd_counter_mutex);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_fops, hd_conservative_time_get, NULL, "%llu\n");

static int hd_adjustment_count_get(void *unused, u64 *val)
{
	*val = atomic64_read(&hd_adjustments);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, hd_adjustment_count_get, NULL, "%llu\n");
static void __init hd_create_debugfs_counters(void)
{
	struct dentry *dir;

	dir = debugfs_create_dir("hiperdispatch", arch_debugfs_dir);
	debugfs_create_file("conservative_time_ms", 0400, dir, NULL, &hd_conservative_time_fops);
	debugfs_create_file("greedy_time_ms", 0400, dir, NULL, &hd_greedy_time_fops);
	debugfs_create_file("adjustment_count", 0400, dir, NULL, &hd_adjustments_fops);
}

static void __init hd_create_attributes(void)
{
	struct device *dev;

	dev = bus_get_dev_root(&cpu_subsys);
	if (!dev)
		return;
	if (sysfs_create_group(&dev->kobj, &hd_attr_group))
		pr_warn("Unable to create hiperdispatch attribute group\n");
	put_device(dev);
}

static int __init hd_init(void)
{
	if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON)) {
		hd_set_hiperdispatch_mode(1);
		topology_schedule_update();
	}
	if (!register_sysctl("s390", hiperdispatch_ctl_table))
		pr_warn("Failed to register s390.hiperdispatch sysctl attribute\n");
	hd_create_debugfs_counters();
	hd_create_attributes();
	return 0;
}
late_initcall(hd_init);
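
/*
 * Runtime control, assuming the default proc and debugfs mount points:
 *
 *	echo 1 > /proc/sys/s390/hiperdispatch
 *	echo 25 > /sys/devices/system/cpu/hiperdispatch/hd_steal_threshold
 *	cat /sys/kernel/debug/s390/hiperdispatch/adjustment_count
 */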