cpufreq_schedutil.c

// SPDX-License-Identifier: GPL-2.0
/*
 * CPUFreq governor based on scheduler-provided CPU utilization data.
 *
 * Copyright (C) 2016, Intel Corporation
 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 */

#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8)

struct sugov_tunables {
        struct gov_attr_set attr_set;
        unsigned int rate_limit_us;
};

struct sugov_policy {
        struct cpufreq_policy *policy;

        struct sugov_tunables *tunables;
        struct list_head tunables_hook;

        raw_spinlock_t update_lock;
        u64 last_freq_update_time;
        s64 freq_update_delay_ns;
        unsigned int next_freq;
        unsigned int cached_raw_freq;

        /* The next fields are only needed if fast switch cannot be used: */
        struct irq_work irq_work;
        struct kthread_work work;
        struct mutex work_lock;
        struct kthread_worker worker;
        struct task_struct *thread;
        bool work_in_progress;

        bool limits_changed;
        bool need_freq_update;
};

struct sugov_cpu {
        struct update_util_data update_util;
        struct sugov_policy *sg_policy;
        unsigned int cpu;

        bool iowait_boost_pending;
        unsigned int iowait_boost;
        u64 last_update;

        unsigned long util;
        unsigned long bw_min;

        /* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
        unsigned long saved_idle_calls;
#endif
};

static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);

/************************ Governor internals ***********************/

static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
        s64 delta_ns;

        /*
         * Since cpufreq_update_util() is called with rq->lock held for
         * the @target_cpu, our per-CPU data is fully serialized.
         *
         * However, drivers cannot in general deal with cross-CPU
         * requests, so while get_next_freq() will work, our
         * sugov_update_commit() call may not for the fast switching platforms.
         *
         * Hence stop here for remote requests if they aren't supported
         * by the hardware, as calculating the frequency is pointless if
         * we cannot in fact act on it.
         *
         * This is needed on the slow switching platforms too to prevent CPUs
         * going offline from leaving stale IRQ work items behind.
         */
        if (!cpufreq_this_cpu_can_update(sg_policy->policy))
                return false;

        if (unlikely(sg_policy->limits_changed)) {
                sg_policy->limits_changed = false;
                sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
                return true;
        }

        delta_ns = time - sg_policy->last_freq_update_time;

        return delta_ns >= sg_policy->freq_update_delay_ns;
}

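/*
 * Illustrative example (not from the original source): with
 * rate_limit_us == 2000, the sysfs store path sets
 * freq_update_delay_ns = 2000 * NSEC_PER_USEC == 2,000,000 ns, so ordinary
 * utilization updates can change the frequency at most once every 2 ms.
 * A pending limits change takes the limits_changed branch above and
 * bypasses that rate limit.
 */
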
static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
                                   unsigned int next_freq)
{
        if (sg_policy->need_freq_update)
                sg_policy->need_freq_update = false;
        else if (sg_policy->next_freq == next_freq)
                return false;

        sg_policy->next_freq = next_freq;
        sg_policy->last_freq_update_time = time;

        return true;
}

static void sugov_deferred_update(struct sugov_policy *sg_policy)
{
        if (!sg_policy->work_in_progress) {
                sg_policy->work_in_progress = true;
                irq_work_queue(&sg_policy->irq_work);
        }
}

/**
 * get_capacity_ref_freq - get the reference frequency that has been used to
 * correlate frequency and compute capacity for a given cpufreq policy. We use
 * the CPU managing it for the arch_scale_freq_ref() call in the function.
 * @policy: the cpufreq policy of the CPU in question.
 *
 * Return: the reference CPU frequency to compute a capacity.
 */
static __always_inline
unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy)
{
        unsigned int freq = arch_scale_freq_ref(policy->cpu);

        if (freq)
                return freq;

        if (arch_scale_freq_invariant())
                return policy->cpuinfo.max_freq;

        /*
         * Apply a 25% margin so that we select a higher frequency than
         * the current one before the CPU is fully busy:
         */
        return policy->cur + (policy->cur >> 2);
}

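/*
 * Illustrative example of the non-invariant fallback above: with
 * policy->cur == 1200000 (kHz), the 25% margin yields
 * 1200000 + (1200000 >> 2) == 1500000, i.e. the reference frequency is
 * picked a step above the current one before the CPU saturates.
 */
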
/**
 * get_next_freq - Compute a new frequency for a given cpufreq policy.
 * @sg_policy: schedutil policy object to compute the new frequency for.
 * @util: Current CPU utilization.
 * @max: CPU capacity.
 *
 * If the utilization is frequency-invariant, choose the new frequency to be
 * proportional to it, that is
 *
 * next_freq = C * max_freq * util / max
 *
 * Otherwise, approximate the would-be frequency-invariant utilization by
 * util_raw * (curr_freq / max_freq) which leads to
 *
 * next_freq = C * curr_freq * util_raw / max
 *
 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
 *
 * The lowest driver-supported frequency which is equal or greater than the raw
 * next_freq (as calculated above) is returned, subject to policy min/max and
 * cpufreq driver limitations.
 */
static unsigned int get_next_freq(struct sugov_policy *sg_policy,
                                  unsigned long util, unsigned long max)
{
        struct cpufreq_policy *policy = sg_policy->policy;
        unsigned int freq;

        freq = get_capacity_ref_freq(policy);
        freq = map_util_freq(util, freq, max);

        if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
                return sg_policy->next_freq;

        sg_policy->cached_raw_freq = freq;
        return cpufreq_driver_resolve_freq(policy, freq);
}

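/*
 * Worked example (illustrative, assuming map_util_freq() scales the
 * reference frequency linearly, i.e. freq * util / max): with a reference
 * frequency of 2000000 (kHz), util == 512 and max == 1024, the raw
 * frequency is 2000000 * 512 / 1024 == 1000000. That value is cached in
 * cached_raw_freq and then resolved by the driver to the lowest supported
 * frequency that is not below it, subject to the policy min/max.
 */
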
unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
                                       unsigned long min,
                                       unsigned long max)
{
        /* Add dvfs headroom to actual utilization */
        actual = map_util_perf(actual);
        /* Actually we don't need to target the max performance */
        if (actual < max)
                max = actual;

        /*
         * Ensure at least minimum performance while providing more compute
         * capacity when possible.
         */
        return max(min, max);
}

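/*
 * Illustrative example, assuming map_util_perf() adds a 25% DVFS headroom
 * (util + util / 4): with actual == 800, min == 128 and max == 1024, the
 * boosted value is 1000, which is below max, so the function returns
 * max(128, 1000) == 1000. With actual == 900 the boosted value (1125)
 * exceeds max and the result is clamped to 1024.
 */
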
static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost)
{
        unsigned long min, max, util = scx_cpuperf_target(sg_cpu->cpu);

        if (!scx_switched_all())
                util += cpu_util_cfs_boost(sg_cpu->cpu);
        util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
        util = max(util, boost);
        sg_cpu->bw_min = min;
        sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max);
}

/**
 * sugov_iowait_reset() - Reset the IO boost status of a CPU.
 * @sg_cpu: the sugov data for the CPU to boost
 * @time: the update time from the caller
 * @set_iowait_boost: true if an IO boost has been requested
 *
 * The IO wait boost of a task is disabled after a tick since the last update
 * of a CPU. If a new IO wait boost is requested after more than a tick, then
 * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy
 * efficiency by ignoring sporadic wakeups from IO.
 */
static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
                               bool set_iowait_boost)
{
        s64 delta_ns = time - sg_cpu->last_update;

        /* Reset boost only if a tick has elapsed since last request */
        if (delta_ns <= TICK_NSEC)
                return false;

        sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;
        sg_cpu->iowait_boost_pending = set_iowait_boost;

        return true;
}

/**
 * sugov_iowait_boost() - Updates the IO boost status of a CPU.
 * @sg_cpu: the sugov data for the CPU to boost
 * @time: the update time from the caller
 * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
 *
 * Each time a task wakes up after an IO operation, the CPU utilization can be
 * boosted to a certain utilization which doubles at each "frequent and
 * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization
 * of the maximum OPP.
 *
 * To keep doubling, an IO boost has to be requested at least once per tick,
 * otherwise we restart from the utilization of the minimum OPP.
 */
static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
                               unsigned int flags)
{
        bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;

        /* Reset boost if the CPU appears to have been idle enough */
        if (sg_cpu->iowait_boost &&
            sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
                return;

        /* Boost only tasks waking up after IO */
        if (!set_iowait_boost)
                return;

        /* Ensure boost doubles only one time at each request */
        if (sg_cpu->iowait_boost_pending)
                return;
        sg_cpu->iowait_boost_pending = true;

        /* Double the boost at each request */
        if (sg_cpu->iowait_boost) {
                sg_cpu->iowait_boost =
                        min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
                return;
        }

        /* First wakeup after IO: start with minimum boost */
        sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;
}

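/*
 * Illustrative example, assuming SCHED_CAPACITY_SCALE == 1024: IO wakeups
 * arriving within a tick of each other ramp iowait_boost through
 * 128 (IOWAIT_BOOST_MIN), 256, 512 and finally 1024, where the doubling
 * saturates at the utilization of the maximum OPP.
 */
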
/**
 * sugov_iowait_apply() - Apply the IO boost to a CPU.
 * @sg_cpu: the sugov data for the cpu to boost
 * @time: the update time from the caller
 * @max_cap: the max CPU capacity
 *
 * A CPU running a task that woke up after an IO operation can have its
 * utilization boosted to speed up the completion of those IO operations.
 * The IO boost value is increased each time a task wakes up from IO, in
 * sugov_iowait_boost(), and it is instead decreased by this function,
 * each time an increase has not been requested (!iowait_boost_pending).
 *
 * A CPU that also appears to have been idle for at least one tick also has
 * its IO boost reset.
 *
 * This mechanism is designed to boost tasks that frequently wait on IO,
 * while being more conservative on tasks that do sporadic IO operations.
 */
static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
                                        unsigned long max_cap)
{
        /* No boost currently required */
        if (!sg_cpu->iowait_boost)
                return 0;

        /* Reset boost if the CPU appears to have been idle enough */
        if (sugov_iowait_reset(sg_cpu, time, false))
                return 0;

        if (!sg_cpu->iowait_boost_pending) {
                /*
                 * No boost pending; reduce the boost value.
                 */
                sg_cpu->iowait_boost >>= 1;
                if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
                        sg_cpu->iowait_boost = 0;
                        return 0;
                }
        }

        sg_cpu->iowait_boost_pending = false;

        /*
         * sg_cpu->util is already in capacity scale; convert iowait_boost
         * into the same scale so we can compare.
         */
        return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;
}

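/*
 * Illustrative example, assuming SCHED_CAPACITY_SCALE == 1024: once no new
 * boost is pending, a boost of 1024 decays through 512, 256 and 128 on
 * consecutive updates and is then cleared. The returned value is scaled to
 * the CPU capacity, e.g. iowait_boost == 512 on a CPU with max_cap == 512
 * contributes (512 * 512) >> 10 == 256.
 */
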
#ifdef CONFIG_NO_HZ_COMMON
static bool sugov_hold_freq(struct sugov_cpu *sg_cpu)
{
        unsigned long idle_calls;
        bool ret;

        /*
         * The heuristic in this function is for the fair class. For SCX, the
         * performance target comes directly from the BPF scheduler. Let's just
         * follow it.
         */
        if (scx_switched_all())
                return false;

        /* if capped by uclamp_max, always update to be in compliance */
        if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)))
                return false;

        /*
         * Maintain the frequency if the CPU has not been idle recently, as
         * reduction is likely to be premature.
         */
        idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
        ret = idle_calls == sg_cpu->saved_idle_calls;

        sg_cpu->saved_idle_calls = idle_calls;
        return ret;
}
#else
static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; }
#endif /* CONFIG_NO_HZ_COMMON */

/*
 * Make sugov_should_update_freq() ignore the rate limit when DL
 * has increased the utilization.
 */
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
{
        if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min)
                sg_cpu->sg_policy->limits_changed = true;
}

static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
                                              u64 time, unsigned long max_cap,
                                              unsigned int flags)
{
        unsigned long boost;

        sugov_iowait_boost(sg_cpu, time, flags);
        sg_cpu->last_update = time;

        ignore_dl_rate_limit(sg_cpu);

        if (!sugov_should_update_freq(sg_cpu->sg_policy, time))
                return false;

        boost = sugov_iowait_apply(sg_cpu, time, max_cap);
        sugov_get_util(sg_cpu, boost);

        return true;
}

static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
                                     unsigned int flags)
{
        struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
        struct sugov_policy *sg_policy = sg_cpu->sg_policy;
        unsigned int cached_freq = sg_policy->cached_raw_freq;
        unsigned long max_cap;
        unsigned int next_f;

        max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);

        if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
                return;

        next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap);

        if (sugov_hold_freq(sg_cpu) && next_f < sg_policy->next_freq &&
            !sg_policy->need_freq_update) {
                next_f = sg_policy->next_freq;

                /* Restore cached freq as next_freq has changed */
                sg_policy->cached_raw_freq = cached_freq;
        }

        if (!sugov_update_next_freq(sg_policy, time, next_f))
                return;

        /*
         * This code runs under rq->lock for the target CPU, so it won't run
         * concurrently on two different CPUs for the same target and it is not
         * necessary to acquire the lock in the fast switch case.
         */
        if (sg_policy->policy->fast_switch_enabled) {
                cpufreq_driver_fast_switch(sg_policy->policy, next_f);
        } else {
                raw_spin_lock(&sg_policy->update_lock);
                sugov_deferred_update(sg_policy);
                raw_spin_unlock(&sg_policy->update_lock);
        }
}

static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
                                     unsigned int flags)
{
        struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
        unsigned long prev_util = sg_cpu->util;
        unsigned long max_cap;

        /*
         * Fall back to the "frequency" path if frequency invariance is not
         * supported, because the direct mapping between the utilization and
         * the performance levels depends on the frequency invariance.
         */
        if (!arch_scale_freq_invariant()) {
                sugov_update_single_freq(hook, time, flags);
                return;
        }

        max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);

        if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
                return;

        if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util)
                sg_cpu->util = prev_util;

        cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,
                                   sg_cpu->util, max_cap);

        sg_cpu->sg_policy->last_freq_update_time = time;
}

static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
        struct sugov_policy *sg_policy = sg_cpu->sg_policy;
        struct cpufreq_policy *policy = sg_policy->policy;
        unsigned long util = 0, max_cap;
        unsigned int j;

        max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);

        for_each_cpu(j, policy->cpus) {
                struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
                unsigned long boost;

                boost = sugov_iowait_apply(j_sg_cpu, time, max_cap);
                sugov_get_util(j_sg_cpu, boost);

                util = max(j_sg_cpu->util, util);
        }

        return get_next_freq(sg_policy, util, max_cap);
}

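/*
 * Illustrative example: for a shared policy spanning three CPUs with
 * effective utilizations of 300, 700 and 450, the loop above keeps the
 * maximum (700), so the whole policy is driven by its busiest CPU.
 */
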
static void
sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
{
        struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
        struct sugov_policy *sg_policy = sg_cpu->sg_policy;
        unsigned int next_f;

        raw_spin_lock(&sg_policy->update_lock);

        sugov_iowait_boost(sg_cpu, time, flags);
        sg_cpu->last_update = time;

        ignore_dl_rate_limit(sg_cpu);

        if (sugov_should_update_freq(sg_policy, time)) {
                next_f = sugov_next_freq_shared(sg_cpu, time);

                if (!sugov_update_next_freq(sg_policy, time, next_f))
                        goto unlock;

                if (sg_policy->policy->fast_switch_enabled)
                        cpufreq_driver_fast_switch(sg_policy->policy, next_f);
                else
                        sugov_deferred_update(sg_policy);
        }
unlock:
        raw_spin_unlock(&sg_policy->update_lock);
}

static void sugov_work(struct kthread_work *work)
{
        struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
        unsigned int freq;
        unsigned long flags;

        /*
         * Hold sg_policy->update_lock briefly to handle the case where
         * sg_policy->next_freq is read here and then updated by
         * sugov_deferred_update() just before work_in_progress is set to
         * false here; without the lock we may miss queueing the new update.
         *
         * Note: If a work item was queued after the update_lock is released,
         * sugov_work() will just be called again by the kthread_work code;
         * the request will be processed before the sugov thread sleeps.
         */
        raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
        freq = sg_policy->next_freq;
        sg_policy->work_in_progress = false;
        raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);

        mutex_lock(&sg_policy->work_lock);
        __cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
        mutex_unlock(&sg_policy->work_lock);
}

static void sugov_irq_work(struct irq_work *irq_work)
{
        struct sugov_policy *sg_policy;

        sg_policy = container_of(irq_work, struct sugov_policy, irq_work);

        kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}

/************************** sysfs interface ************************/

static struct sugov_tunables *global_tunables;
static DEFINE_MUTEX(global_tunables_lock);

static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
{
        return container_of(attr_set, struct sugov_tunables, attr_set);
}

static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
{
        struct sugov_tunables *tunables = to_sugov_tunables(attr_set);

        return sprintf(buf, "%u\n", tunables->rate_limit_us);
}

static ssize_t
rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
{
        struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
        struct sugov_policy *sg_policy;
        unsigned int rate_limit_us;

        if (kstrtouint(buf, 10, &rate_limit_us))
                return -EINVAL;

        tunables->rate_limit_us = rate_limit_us;

        list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
                sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;

        return count;
}

static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);

static struct attribute *sugov_attrs[] = {
        &rate_limit_us.attr,
        NULL
};
ATTRIBUTE_GROUPS(sugov);

static void sugov_tunables_free(struct kobject *kobj)
{
        struct gov_attr_set *attr_set = to_gov_attr_set(kobj);

        kfree(to_sugov_tunables(attr_set));
}

static const struct kobj_type sugov_tunables_ktype = {
        .default_groups = sugov_groups,
        .sysfs_ops = &governor_sysfs_ops,
        .release = &sugov_tunables_free,
};

/********************** cpufreq governor interface *********************/

#ifdef CONFIG_ENERGY_MODEL
static void rebuild_sd_workfn(struct work_struct *work)
{
        rebuild_sched_domains_energy();
}

static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);

/*
 * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
 * on governor changes to make sure the scheduler knows about it.
 */
static void sugov_eas_rebuild_sd(void)
{
        /*
         * When called from the cpufreq_register_driver() path, the
         * cpu_hotplug_lock is already held, so use a work item to
         * avoid nested locking in rebuild_sched_domains().
         */
        schedule_work(&rebuild_sd_work);
}
#else
static inline void sugov_eas_rebuild_sd(void) { };
#endif

struct cpufreq_governor schedutil_gov;

static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
{
        struct sugov_policy *sg_policy;

        sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
        if (!sg_policy)
                return NULL;

        sg_policy->policy = policy;
        raw_spin_lock_init(&sg_policy->update_lock);
        return sg_policy;
}

static void sugov_policy_free(struct sugov_policy *sg_policy)
{
        kfree(sg_policy);
}

static int sugov_kthread_create(struct sugov_policy *sg_policy)
{
        struct task_struct *thread;
        struct sched_attr attr = {
                .size = sizeof(struct sched_attr),
                .sched_policy = SCHED_DEADLINE,
                .sched_flags = SCHED_FLAG_SUGOV,
                .sched_nice = 0,
                .sched_priority = 0,
                /*
                 * Fake (unused) bandwidth; workaround to "fix"
                 * priority inheritance.
                 */
                .sched_runtime = NSEC_PER_MSEC,
                .sched_deadline = 10 * NSEC_PER_MSEC,
                .sched_period = 10 * NSEC_PER_MSEC,
        };
        struct cpufreq_policy *policy = sg_policy->policy;
        int ret;

        /* kthread only required for slow path */
        if (policy->fast_switch_enabled)
                return 0;

        kthread_init_work(&sg_policy->work, sugov_work);
        kthread_init_worker(&sg_policy->worker);
        thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
                                "sugov:%d",
                                cpumask_first(policy->related_cpus));
        if (IS_ERR(thread)) {
                pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
                return PTR_ERR(thread);
        }

        ret = sched_setattr_nocheck(thread, &attr);
        if (ret) {
                kthread_stop(thread);
                pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
                return ret;
        }

        sg_policy->thread = thread;
        kthread_bind_mask(thread, policy->related_cpus);
        init_irq_work(&sg_policy->irq_work, sugov_irq_work);
        mutex_init(&sg_policy->work_lock);

        wake_up_process(thread);

        return 0;
}

static void sugov_kthread_stop(struct sugov_policy *sg_policy)
{
        /* kthread only required for slow path */
        if (sg_policy->policy->fast_switch_enabled)
                return;

        kthread_flush_worker(&sg_policy->worker);
        kthread_stop(sg_policy->thread);
        mutex_destroy(&sg_policy->work_lock);
}

static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
{
        struct sugov_tunables *tunables;

        tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
        if (tunables) {
                gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
                if (!have_governor_per_policy())
                        global_tunables = tunables;
        }
        return tunables;
}

static void sugov_clear_global_tunables(void)
{
        if (!have_governor_per_policy())
                global_tunables = NULL;
}

static int sugov_init(struct cpufreq_policy *policy)
{
        struct sugov_policy *sg_policy;
        struct sugov_tunables *tunables;
        int ret = 0;

        /* State should be equivalent to EXIT */
        if (policy->governor_data)
                return -EBUSY;

        cpufreq_enable_fast_switch(policy);

        sg_policy = sugov_policy_alloc(policy);
        if (!sg_policy) {
                ret = -ENOMEM;
                goto disable_fast_switch;
        }

        ret = sugov_kthread_create(sg_policy);
        if (ret)
                goto free_sg_policy;

        mutex_lock(&global_tunables_lock);

        if (global_tunables) {
                if (WARN_ON(have_governor_per_policy())) {
                        ret = -EINVAL;
                        goto stop_kthread;
                }
                policy->governor_data = sg_policy;
                sg_policy->tunables = global_tunables;

                gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
                goto out;
        }

        tunables = sugov_tunables_alloc(sg_policy);
        if (!tunables) {
                ret = -ENOMEM;
                goto stop_kthread;
        }

        tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);

        policy->governor_data = sg_policy;
        sg_policy->tunables = tunables;

        ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
                                   get_governor_parent_kobj(policy), "%s",
                                   schedutil_gov.name);
        if (ret)
                goto fail;

out:
        sugov_eas_rebuild_sd();
        mutex_unlock(&global_tunables_lock);
        return 0;

fail:
        kobject_put(&tunables->attr_set.kobj);
        policy->governor_data = NULL;
        sugov_clear_global_tunables();

stop_kthread:
        sugov_kthread_stop(sg_policy);
        mutex_unlock(&global_tunables_lock);

free_sg_policy:
        sugov_policy_free(sg_policy);

disable_fast_switch:
        cpufreq_disable_fast_switch(policy);

        pr_err("initialization failed (error %d)\n", ret);
        return ret;
}

static void sugov_exit(struct cpufreq_policy *policy)
{
        struct sugov_policy *sg_policy = policy->governor_data;
        struct sugov_tunables *tunables = sg_policy->tunables;
        unsigned int count;

        mutex_lock(&global_tunables_lock);

        count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
        policy->governor_data = NULL;
        if (!count)
                sugov_clear_global_tunables();

        mutex_unlock(&global_tunables_lock);

        sugov_kthread_stop(sg_policy);
        sugov_policy_free(sg_policy);
        cpufreq_disable_fast_switch(policy);

        sugov_eas_rebuild_sd();
}

static int sugov_start(struct cpufreq_policy *policy)
{
        struct sugov_policy *sg_policy = policy->governor_data;
        void (*uu)(struct update_util_data *data, u64 time, unsigned int flags);
        unsigned int cpu;

        sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
        sg_policy->last_freq_update_time = 0;
        sg_policy->next_freq = 0;
        sg_policy->work_in_progress = false;
        sg_policy->limits_changed = false;
        sg_policy->cached_raw_freq = 0;

        sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);

        if (policy_is_shared(policy))
                uu = sugov_update_shared;
        else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf())
                uu = sugov_update_single_perf;
        else
                uu = sugov_update_single_freq;

        for_each_cpu(cpu, policy->cpus) {
                struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

                memset(sg_cpu, 0, sizeof(*sg_cpu));
                sg_cpu->cpu = cpu;
                sg_cpu->sg_policy = sg_policy;
                cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu);
        }
        return 0;
}

static void sugov_stop(struct cpufreq_policy *policy)
{
        struct sugov_policy *sg_policy = policy->governor_data;
        unsigned int cpu;

        for_each_cpu(cpu, policy->cpus)
                cpufreq_remove_update_util_hook(cpu);

        synchronize_rcu();

        if (!policy->fast_switch_enabled) {
                irq_work_sync(&sg_policy->irq_work);
                kthread_cancel_work_sync(&sg_policy->work);
        }
}

static void sugov_limits(struct cpufreq_policy *policy)
{
        struct sugov_policy *sg_policy = policy->governor_data;

        if (!policy->fast_switch_enabled) {
                mutex_lock(&sg_policy->work_lock);
                cpufreq_policy_apply_limits(policy);
                mutex_unlock(&sg_policy->work_lock);
        }

        sg_policy->limits_changed = true;
}

struct cpufreq_governor schedutil_gov = {
        .name = "schedutil",
        .owner = THIS_MODULE,
        .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING,
        .init = sugov_init,
        .exit = sugov_exit,
        .start = sugov_start,
        .stop = sugov_stop,
        .limits = sugov_limits,
};

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
struct cpufreq_governor *cpufreq_default_governor(void)
{
        return &schedutil_gov;
}
#endif

cpufreq_governor_init(schedutil_gov);