// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2024
 */

#define KMSG_COMPONENT "hd"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

/*
 * Hiperdispatch:
 * Dynamically calculates the optimum number of high capacity COREs
 * by considering the state the system is in. When hiperdispatch decides
 * that a capacity update is necessary, it schedules a topology update.
 * During topology updates the CPU capacities are always re-adjusted.
 *
 * There are two places where CPU capacities are accessed within
 * hiperdispatch:
 * -> hiperdispatch's recurring work function reads CPU capacities to
 *    determine the high capacity CPU count.
 * -> during a topology update hiperdispatch's adjustment function
 *    updates CPU capacities.
 * These two can run on different CPUs in parallel, which can cause
 * hiperdispatch to make wrong decisions. This can potentially cause
 * some overhead by leading to extra rebuild_sched_domains() calls
 * for correction. Access to capacities within hiperdispatch has to be
 * serialized to prevent this overhead.
 *
 * Hiperdispatch decision making revolves around steal time.
 * The HD_STEAL_THRESHOLD value is taken as reference. Whenever steal
 * time crosses the threshold value, hiperdispatch falls back to giving
 * high capacities to entitled CPUs only. When steal time drops below
 * the threshold boundary, hiperdispatch utilizes all CPUs by giving
 * all of them high capacity.
 *
 * The theory behind HD_STEAL_THRESHOLD is related to SMP thread
 * performance. Comparing the throughput of:
 * - a single CORE, with N threads, running N tasks
 * - N separate COREs running N tasks,
 * using individual COREs for individual tasks yields better
 * performance. This performance difference is roughly ~30% (it can
 * vary between machine generations).
 *
 * Hiperdispatch tries to hint the scheduler to use individual COREs
 * for each task, as long as steal time on those COREs is less than
 * 30%, therefore delaying the throughput loss caused by using SMP
 * threads.
 */

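/*
 * Worked example of the decision logic above (illustrative values):
 * with the default hd_steal_threshold of 30, an averaged steal time of
 * e.g. 12% keeps all online COREs at high capacity, while an averaged
 * steal time of e.g. 35% restricts high capacity to the entitled
 * (vertical high and medium) COREs until steal time drops below the
 * threshold again. See hd_capacity_work_fn() below.
 */
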
#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/kernel_stat.h>
#include <linux/kstrtox.h>
#include <linux/ktime.h>
#include <linux/sysctl.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <asm/hiperdispatch.h>
#include <asm/setup.h>
#include <asm/smp.h>
#include <asm/topology.h>

#define CREATE_TRACE_POINTS
#include <asm/trace/hiperdispatch.h>

#define HD_DELAY_FACTOR     (4)
#define HD_DELAY_INTERVAL   (HZ / 4)
#define HD_STEAL_THRESHOLD  30
#define HD_STEAL_AVG_WEIGHT 16

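/*
 * Timing of the recurring work, derived from the values above:
 * HD_DELAY_INTERVAL is HZ / 4, i.e. 250ms. After hiperdispatch is
 * (re-)enabled, the first capacity check is delayed by
 * HD_DELAY_INTERVAL * hd_delay_factor (1s with the default factor of
 * 4), presumably to let steal time settle after the topology change;
 * afterwards the work reschedules itself every HD_DELAY_INTERVAL.
 */
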
static cpumask_t hd_vl_coremask;        /* Mask containing all vertical low COREs */
static cpumask_t hd_vmvl_cpumask;       /* Mask containing vertical medium and low CPUs */
static int hd_high_capacity_cores;      /* Current CORE count with high capacity */
static int hd_entitled_cores;           /* Total vertical high and medium CORE count */
static int hd_online_cores;             /* Current online CORE count */

static unsigned long hd_previous_steal; /* Previous iteration's CPU steal timer total */
static unsigned long hd_high_time;      /* Total time spent while all cpus have high capacity */
static unsigned long hd_low_time;       /* Total time spent while vl cpus have low capacity */
static atomic64_t hd_adjustments;       /* Total occurrence count of hiperdispatch adjustments */

static unsigned int hd_steal_threshold = HD_STEAL_THRESHOLD;
static unsigned int hd_delay_factor = HD_DELAY_FACTOR;
static int hd_enabled;

static void hd_capacity_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn);

static int hd_set_hiperdispatch_mode(int enable)
{
        if (!MACHINE_HAS_TOPOLOGY)
                enable = 0;
        if (hd_enabled == enable)
                return 0;
        hd_enabled = enable;
        return 1;
}

void hd_reset_state(void)
{
        cpumask_clear(&hd_vl_coremask);
        cpumask_clear(&hd_vmvl_cpumask);
        hd_entitled_cores = 0;
        hd_online_cores = 0;
}

void hd_add_core(int cpu)
{
        const struct cpumask *siblings;
        int polarization;

        hd_online_cores++;
        polarization = smp_cpu_get_polarization(cpu);
        siblings = topology_sibling_cpumask(cpu);
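        /*
         * Vertical high COREs are fully entitled and should not
         * experience steal time, so only the entitled CORE count is
         * tracked for them. Vertical medium COREs are entitled but
         * only partially, and vertical low COREs have no guaranteed
         * entitlement; the CPUs of both are collected in
         * hd_vmvl_cpumask for the steal time calculation.
         */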
        switch (polarization) {
        case POLARIZATION_VH:
                hd_entitled_cores++;
                break;
        case POLARIZATION_VM:
                hd_entitled_cores++;
                cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
                break;
        case POLARIZATION_VL:
                cpumask_set_cpu(cpu, &hd_vl_coremask);
                cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
                break;
        }
}

/* Serialize update and read operations of debug counters. */
static DEFINE_MUTEX(hd_counter_mutex);

static void hd_update_times(void)
{
        static ktime_t prev;
        ktime_t now;

        /*
         * Check if hiperdispatch is active; if not, set prev to 0.
         * This way it is possible to differentiate the first update
         * iteration after enabling hiperdispatch.
         */
        if (hd_entitled_cores == 0 || hd_enabled == 0) {
                prev = ktime_set(0, 0);
                return;
        }
        now = ktime_get();
        if (ktime_after(prev, 0)) {
                if (hd_high_capacity_cores == hd_online_cores)
                        hd_high_time += ktime_ms_delta(now, prev);
                else
                        hd_low_time += ktime_ms_delta(now, prev);
        }
        prev = now;
}

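/*
 * Called from the topology update path to apply the capacity decision
 * made by the work function: entitled COREs always keep high capacity,
 * and (hd_high_capacity_cores - hd_entitled_cores) vertical low COREs
 * are additionally raised to high capacity. hd_high_capacity_cores is
 * recounted from scratch while walking the vertical low CORE mask.
 */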
static void hd_update_capacities(void)
{
        int cpu, upscaling_cores;
        unsigned long capacity;

        upscaling_cores = hd_high_capacity_cores - hd_entitled_cores;
        capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW;
        hd_high_capacity_cores = hd_entitled_cores;
        for_each_cpu(cpu, &hd_vl_coremask) {
                smp_set_core_capacity(cpu, capacity);
                if (capacity != CPU_CAPACITY_HIGH)
                        continue;
                hd_high_capacity_cores++;
                upscaling_cores--;
                if (upscaling_cores == 0)
                        capacity = CPU_CAPACITY_LOW;
        }
}

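/*
 * Example (illustrative values): with hd_entitled_cores = 4 and a
 * target hd_high_capacity_cores of 6, upscaling_cores starts at 2, so
 * the first two vertical low COREs in hd_vl_coremask are set to
 * CPU_CAPACITY_HIGH and all remaining vertical low COREs to
 * CPU_CAPACITY_LOW.
 */
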
void hd_disable_hiperdispatch(void)
{
        cancel_delayed_work_sync(&hd_capacity_work);
        hd_high_capacity_cores = hd_online_cores;
        hd_previous_steal = 0;
}

int hd_enable_hiperdispatch(void)
{
        mutex_lock(&hd_counter_mutex);
        hd_update_times();
        mutex_unlock(&hd_counter_mutex);
        if (hd_enabled == 0)
                return 0;
        if (hd_entitled_cores == 0)
                return 0;
        if (hd_online_cores <= hd_entitled_cores)
                return 0;
        mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor);
        hd_update_capacities();
        return 1;
}

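/*
 * Exponentially weighted moving average of the steal percentage:
 * steal = (steal * 15 + new) / 16 with the default HD_STEAL_AVG_WEIGHT
 * of 16. Each new sample therefore contributes 1/16th to the average,
 * which smooths out short steal time spikes.
 */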
static unsigned long hd_steal_avg(unsigned long new)
{
        static unsigned long steal;

        steal = (steal * (HD_STEAL_AVG_WEIGHT - 1) + new) / HD_STEAL_AVG_WEIGHT;
        return steal;
}

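/*
 * The steal counters in kcpustat and the ktime based time delta are
 * both in nanoseconds, so (steal_delta * 100 / time_delta) yields the
 * steal percentage summed over all CPUs in hd_vmvl_cpumask; dividing
 * by the CPU count gives the average steal percentage per CPU.
 */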
static unsigned long hd_calculate_steal_percentage(void)
{
        unsigned long time_delta, steal_delta, steal, percentage;
        static ktime_t prev;
        int cpus, cpu;
        ktime_t now;

        cpus = 0;
        steal = 0;
        percentage = 0;
        for_each_cpu(cpu, &hd_vmvl_cpumask) {
                steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
                cpus++;
        }
        /*
         * If there are no vertical medium or low CPUs, the steal time
         * is 0, as vertical high CPUs shouldn't experience steal time.
         */
        if (cpus == 0)
                return percentage;
        now = ktime_get();
        time_delta = ktime_to_ns(ktime_sub(now, prev));
        if (steal > hd_previous_steal && hd_previous_steal != 0) {
                steal_delta = (steal - hd_previous_steal) * 100 / time_delta;
                percentage = steal_delta / cpus;
        }
        hd_previous_steal = steal;
        prev = now;
        return percentage;
}

static void hd_capacity_work_fn(struct work_struct *work)
{
        unsigned long steal_percentage, new_cores;

        mutex_lock(&smp_cpu_state_mutex);
        /*
         * If the online CORE count is less than or equal to the
         * entitled CORE count, hiperdispatch does not need to make
         * any adjustments; schedule a topology update to disable
         * hiperdispatch.
         * Normally this check is handled on topology update, but
         * during CPU hot unplug, topology and cpu mask updates are
         * done in reverse order, causing hd_enable_hiperdispatch()
         * to get stale data.
         */
        if (hd_online_cores <= hd_entitled_cores) {
                topology_schedule_update();
                mutex_unlock(&smp_cpu_state_mutex);
                return;
        }
        steal_percentage = hd_steal_avg(hd_calculate_steal_percentage());
        if (steal_percentage < hd_steal_threshold)
                new_cores = hd_online_cores;
        else
                new_cores = hd_entitled_cores;
        if (hd_high_capacity_cores != new_cores) {
                trace_s390_hd_rebuild_domains(hd_high_capacity_cores, new_cores);
                hd_high_capacity_cores = new_cores;
                atomic64_inc(&hd_adjustments);
                topology_schedule_update();
        }
        trace_s390_hd_work_fn(steal_percentage, hd_entitled_cores, hd_high_capacity_cores);
        mutex_unlock(&smp_cpu_state_mutex);
        schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL);
}

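/*
 * The sysctl below is registered under the "s390" directory in
 * hd_init(), so hiperdispatch can be toggled at runtime with e.g.:
 *
 *   echo 1 > /proc/sys/s390/hiperdispatch
 *
 * Only the values 0 and 1 are accepted (SYSCTL_ZERO/SYSCTL_ONE).
 */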
static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write,
                                     void *buffer, size_t *lenp, loff_t *ppos)
{
        int hiperdispatch;
        int rc;
        struct ctl_table ctl_entry = {
                .procname = ctl->procname,
                .data = &hiperdispatch,
                .maxlen = sizeof(int),
                .extra1 = SYSCTL_ZERO,
                .extra2 = SYSCTL_ONE,
        };

        hiperdispatch = hd_enabled;
        rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
        if (rc < 0 || !write)
                return rc;
        mutex_lock(&smp_cpu_state_mutex);
        if (hd_set_hiperdispatch_mode(hiperdispatch))
                topology_schedule_update();
        mutex_unlock(&smp_cpu_state_mutex);
        return 0;
}

static struct ctl_table hiperdispatch_ctl_table[] = {
        {
                .procname = "hiperdispatch",
                .mode = 0644,
                .proc_handler = hiperdispatch_ctl_handler,
        },
};

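/*
 * The attributes below are placed in the "hiperdispatch" group on the
 * CPU subsystem root device, i.e. they show up as
 * /sys/devices/system/cpu/hiperdispatch/hd_steal_threshold and
 * /sys/devices/system/cpu/hiperdispatch/hd_delay_factor.
 */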
static ssize_t hd_steal_threshold_show(struct device *dev,
                                       struct device_attribute *attr,
                                       char *buf)
{
        return sysfs_emit(buf, "%u\n", hd_steal_threshold);
}

static ssize_t hd_steal_threshold_store(struct device *dev,
                                        struct device_attribute *attr,
                                        const char *buf,
                                        size_t count)
{
        unsigned int val;
        int rc;

        rc = kstrtouint(buf, 0, &val);
        if (rc)
                return rc;
        if (val > 100)
                return -ERANGE;
        hd_steal_threshold = val;
        return count;
}

static DEVICE_ATTR_RW(hd_steal_threshold);

static ssize_t hd_delay_factor_show(struct device *dev,
                                    struct device_attribute *attr,
                                    char *buf)
{
        return sysfs_emit(buf, "%u\n", hd_delay_factor);
}

static ssize_t hd_delay_factor_store(struct device *dev,
                                     struct device_attribute *attr,
                                     const char *buf,
                                     size_t count)
{
        unsigned int val;
        int rc;

        rc = kstrtouint(buf, 0, &val);
        if (rc)
                return rc;
        if (!val)
                return -ERANGE;
        hd_delay_factor = val;
        return count;
}

static DEVICE_ATTR_RW(hd_delay_factor);

static struct attribute *hd_attrs[] = {
        &dev_attr_hd_steal_threshold.attr,
        &dev_attr_hd_delay_factor.attr,
        NULL,
};

static const struct attribute_group hd_attr_group = {
        .name = "hiperdispatch",
        .attrs = hd_attrs,
};

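/*
 * Read-only debug counters, created under the architecture debugfs
 * directory (typically /sys/kernel/debug/s390/hiperdispatch/):
 * greedy_time_ms counts time spent with all COREs at high capacity,
 * conservative_time_ms counts time spent with vertical low COREs at
 * low capacity, and adjustment_count counts capacity adjustments.
 */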
static int hd_greedy_time_get(void *unused, u64 *val)
{
        mutex_lock(&hd_counter_mutex);
        hd_update_times();
        *val = hd_high_time;
        mutex_unlock(&hd_counter_mutex);
        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, hd_greedy_time_get, NULL, "%llu\n");

static int hd_conservative_time_get(void *unused, u64 *val)
{
        mutex_lock(&hd_counter_mutex);
        hd_update_times();
        *val = hd_low_time;
        mutex_unlock(&hd_counter_mutex);
        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_fops, hd_conservative_time_get, NULL, "%llu\n");

static int hd_adjustment_count_get(void *unused, u64 *val)
{
        *val = atomic64_read(&hd_adjustments);
        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, hd_adjustment_count_get, NULL, "%llu\n");

static void __init hd_create_debugfs_counters(void)
{
        struct dentry *dir;

        dir = debugfs_create_dir("hiperdispatch", arch_debugfs_dir);
        debugfs_create_file("conservative_time_ms", 0400, dir, NULL, &hd_conservative_time_fops);
        debugfs_create_file("greedy_time_ms", 0400, dir, NULL, &hd_greedy_time_fops);
        debugfs_create_file("adjustment_count", 0400, dir, NULL, &hd_adjustments_fops);
}

static void __init hd_create_attributes(void)
{
        struct device *dev;

        dev = bus_get_dev_root(&cpu_subsys);
        if (!dev)
                return;
        if (sysfs_create_group(&dev->kobj, &hd_attr_group))
                pr_warn("Unable to create hiperdispatch attribute group\n");
        put_device(dev);
}

static int __init hd_init(void)
{
        if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON)) {
                hd_set_hiperdispatch_mode(1);
                topology_schedule_update();
        }
        if (!register_sysctl("s390", hiperdispatch_ctl_table))
                pr_warn("Failed to register s390.hiperdispatch sysctl attribute\n");
        hd_create_debugfs_counters();
        hd_create_attributes();
        return 0;
}

late_initcall(hd_init);