blk-iocost.c

  1. /* SPDX-License-Identifier: GPL-2.0
  2. *
  3. * IO cost model based controller.
  4. *
  5. * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
  6. * Copyright (C) 2019 Andy Newell <newella@fb.com>
  7. * Copyright (C) 2019 Facebook
  8. *
  9. * One challenge of controlling IO resources is the lack of a trivially
  10. * observable cost metric. This is distinguished from CPU and memory where
  11. * wallclock time and the number of bytes can serve as accurate enough
  12. * approximations.
  13. *
  14. * Bandwidth and iops are the most commonly used metrics for IO devices but
  15. * depending on the type and specifics of the device, different IO patterns
  16. * easily lead to multiple orders of magnitude variations rendering them
  17. * useless for the purpose of IO capacity distribution. While on-device
  18. * time, with a lot of crutches, could serve as a useful approximation for
  19. * non-queued rotational devices, this is no longer viable with modern
  20. * devices, even the rotational ones.
  21. *
  22. * While there is no cost metric we can trivially observe, it isn't a
  23. * complete mystery. For example, on a rotational device, seek cost
  24. * dominates while a contiguous transfer contributes a smaller amount
  25. * proportional to the size. If we can characterize at least the relative
  26. * costs of these different types of IOs, it should be possible to
  27. * implement a reasonable work-conserving proportional IO resource
  28. * distribution.
  29. *
  30. * 1. IO Cost Model
  31. *
  32. * IO cost model estimates the cost of an IO given its basic parameters and
  33. * history (e.g. the end sector of the last IO). The cost is measured in
  34. * device time. If a given IO is estimated to cost 10ms, the device should
  35. * be able to process ~100 of those IOs in a second.
  36. *
  37. * Currently, there's only one builtin cost model - linear. Each IO is
  38. * classified as sequential or random and given a base cost accordingly.
  39. * On top of that, a size cost proportional to the length of the IO is
  40. * added. While simple, this model captures the operational
  41. * characteristics of a wide variety of devices well enough. Default
  42. * parameters for several different classes of devices are provided and the
  43. * parameters can be configured from userspace via
  44. * /sys/fs/cgroup/io.cost.model.
  45. *
  46. * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
  47. * device-specific coefficients.
  48. *
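 * For illustration, a hand-tuned linear model could be applied with
 * something along these lines (the 8:16 MAJ:MIN and the values are only
 * placeholders; see Documentation/admin-guide/cgroup-v2.rst for the
 * authoritative syntax):
 *
 *   # echo "8:16 ctrl=user model=linear rbps=174019176 rseqiops=41708 \
 *       rrandiops=370 wbps=178075866 wseqiops=42705 wrandiops=378" \
 *       > /sys/fs/cgroup/io.cost.model
 *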
  49. * 2. Control Strategy
  50. *
  51. * The device virtual time (vtime) is used as the primary control metric.
  52. * The control strategy is composed of the following three parts.
  53. *
  54. * 2-1. Vtime Distribution
  55. *
  56. * When a cgroup becomes active in terms of IOs, its hierarchical share is
  57. * calculated. Please consider the following hierarchy where the numbers
  58. * inside parentheses denote the configured weights.
  59. *
  60. *          root
  61. *        /        \
  62. *       A (w:100)   B (w:300)
  63. *      /        \
  64. *  A0 (w:100)  A1 (w:100)
  65. *
  66. * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
  67. * of equal weight, each gets 50% share. If then B starts issuing IOs, B
  68. * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
  69. * 12.5% each. The distribution mechanism only cares about these flattened
  70. * shares. They're called hweights (hierarchical weights) and always add
  71. * up to 1 (WEIGHT_ONE).
  72. *
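 * As a worked example of the flattening above:
 *
 *   hweight(A0) = w(A) / (w(A) + w(B)) * w(A0) / (w(A0) + w(A1))
 *               = 100 / 400 * 100 / 200 = 12.5%
 *
 * which is 0.125 * 65536 = 8192 in WEIGHT_ONE units.
 *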
  73. * A given cgroup's vtime runs slower in inverse proportion to its hweight.
  74. * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
  75. * against the device vtime - an IO which takes 10ms on the underlying
  76. * device is considered to take 80ms on A0.
  77. *
  78. * This constitutes the basis of IO capacity distribution. Each cgroup's
  79. * vtime is running at a rate determined by its hweight. A cgroup tracks
  80. * the vtime consumed by past IOs and can issue a new IO if doing so
  81. * wouldn't outrun the current device vtime. Otherwise, the IO is
  82. * suspended until the vtime has progressed enough to cover it.
  83. *
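 * A minimal sketch of that budget check (hypothetical helper names; the
 * actual decision is made in ioc_rqos_throttle() further down in this
 * file):
 *
 *	vbudget = vnow - atomic64_read(&iocg->vtime);
 *	if (cost <= vbudget)
 *		issue_and_charge();	// within budget
 *	else
 *		wait_or_take_debt();	// park on waitq or add to abs_vdebt
 *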
  84. * 2-2. Vrate Adjustment
  85. *
  86. * It's unrealistic to expect the cost model to be perfect. There are too
  87. * many devices and even on the same device the overall performance
  88. * fluctuates depending on numerous factors such as IO mixture and device
  89. * internal garbage collection. The controller needs to adapt dynamically.
  90. *
  91. * This is achieved by adjusting the overall IO rate according to how busy
  92. * the device is. If the device becomes overloaded, we're sending down too
  93. * many IOs and should generally slow down. If there are waiting issuers
  94. * but the device isn't saturated, we're issuing too few and should
  95. * generally speed up.
  96. *
  97. * To slow down, we lower the vrate - the rate at which the device vtime
  98. * passes compared to the wall clock. For example, if the vtime is running
  99. * at the vrate of 75%, all cgroups added up would only be able to issue
  100. * 750ms worth of IOs per second, and vice-versa for speeding up.
  101. *
  102. * Device busyness is determined using two criteria - rq wait and
  103. * completion latencies.
  104. *
  105. * When a device gets saturated, the on-device and then the request queues
  106. * fill up and a bio which is ready to be issued has to wait for a request
  107. * to become available. When this delay becomes noticeable, it's a clear
  108. * indication that the device is saturated and we lower the vrate. This
  109. * saturation signal is fairly conservative as it only triggers when both
  110. * hardware and software queues are filled up, and is used as the default
  111. * busy signal.
  112. *
  113. * As devices can have deep queues and be unfair in how the queued commands
  114. * are executed, solely depending on rq wait may not result in satisfactory
  115. * control quality. For a better control quality, completion latency QoS
  116. * parameters can be configured so that the device is considered saturated
  117. * if N'th percentile completion latency rises above the set point.
  118. *
  119. * The completion latency requirements are a function of both the
  120. * underlying device characteristics and the desired IO latency quality of
  121. * service. There is an inherent trade-off - the tighter the latency QoS,
  122. * the higher the bandwidth loss. Latency QoS is disabled by default
  123. * and can be set through /sys/fs/cgroup/io.cost.qos.
  124. *
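 * For illustration, a 95th percentile latency QoS could look something
 * like this (placeholder MAJ:MIN and values; see
 * Documentation/admin-guide/cgroup-v2.rst for the authoritative syntax):
 *
 *   # echo "8:16 enable=1 ctrl=user rpct=95.00 rlat=10000 wpct=95.00 \
 *       wlat=20000 min=50.00 max=150.00" > /sys/fs/cgroup/io.cost.qos
 *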
  125. * 2-3. Work Conservation
  126. *
  127. * Imagine two cgroups A and B with equal weights. A is issuing a small IO
  128. * periodically while B is sending out enough parallel IOs to saturate the
  129. * device on its own. Let's say A's usage amounts to 100ms worth of IO
  130. * cost per second, i.e., 10% of the device capacity. The naive
  131. * distribution of half and half would lead to 60% utilization of the
  132. * device, a significant reduction in the total amount of work done
  133. * compared to free-for-all competition. This is too high a cost to pay
  134. * for IO control.
  135. *
  136. * To conserve the total amount of work done, we keep track of how much
  137. * each active cgroup is actually using and yield part of its weight if
  138. * there are other cgroups which can make use of it. In the above case,
  139. * A's weight will be lowered so that it hovers above the actual usage and
  140. * B would be able to use the rest.
  141. *
  142. * As we don't want to penalize a cgroup for donating its weight, the
  143. * surplus weight adjustment factors in a margin and has an immediate
  144. * snapback mechanism in case the cgroup needs more IO vtime for itself.
  145. *
  146. * Note that adjusting down surplus weights has the same effects as
  147. * accelerating vtime for other cgroups and work conservation can also be
  148. * implemented by adjusting vrate dynamically. However, squaring away who
  149. * can donate and who should take back how much requires hweight
  150. * propagation anyway, which makes it easier to implement and understand
  151. * as a separate mechanism.
  152. *
  153. * 3. Monitoring
  154. *
  155. * Instead of debugfs or other clumsy monitoring mechanisms, this
  156. * controller uses a drgn based monitoring script -
  157. * tools/cgroup/iocost_monitor.py. For details on drgn, please see
  158. * https://github.com/osandov/drgn. The output looks like the following.
  159. *
  160. * sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
  161. *                active      weight      hweight% inflt% dbt  delay usages%
  162. * test/a             *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
  163. * test/b             *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
  164. *
  165. * - per : Timer period
  166. * - cur_per : Internal wall and device vtime clock
  167. * - vrate : Device virtual time rate against wall clock
  168. * - weight : Surplus-adjusted and configured weights
  169. * - hweight : Surplus-adjusted and configured hierarchical weights
  170. * - inflt : The percentage of in-flight IO cost at the end of last period
  171. * - del_ms : Deferred issuer delay induction level and duration
  172. * - usages : Usage history
  173. */
  174. #include <linux/kernel.h>
  175. #include <linux/module.h>
  176. #include <linux/timer.h>
  177. #include <linux/time64.h>
  178. #include <linux/parser.h>
  179. #include <linux/sched/signal.h>
  180. #include <asm/local.h>
  181. #include <asm/local64.h>
  182. #include "blk-rq-qos.h"
  183. #include "blk-stat.h"
  184. #include "blk-wbt.h"
  185. #include "blk-cgroup.h"
  186. #ifdef CONFIG_TRACEPOINTS
  187. /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
  188. #define TRACE_IOCG_PATH_LEN 1024
  189. static DEFINE_SPINLOCK(trace_iocg_path_lock);
  190. static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
  191. #define TRACE_IOCG_PATH(type, iocg, ...) \
  192. do { \
  193. unsigned long flags; \
  194. if (trace_iocost_##type##_enabled()) { \
  195. spin_lock_irqsave(&trace_iocg_path_lock, flags); \
  196. cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
  197. trace_iocg_path, TRACE_IOCG_PATH_LEN); \
  198. trace_iocost_##type(iocg, trace_iocg_path, \
  199. ##__VA_ARGS__); \
  200. spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \
  201. } \
  202. } while (0)
  203. #else /* CONFIG_TRACEPOINTS */
  204. #define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0)
  205. #endif /* CONFIG_TRACEPOINTS */
  206. enum {
  207. MILLION = 1000000,
  208. /* timer period is calculated from latency requirements, bound it */
  209. MIN_PERIOD = USEC_PER_MSEC,
  210. MAX_PERIOD = USEC_PER_SEC,
  211. /*
  212. * iocg->vtime is targeted at 50% behind the device vtime, which
  213. * serves as its IO credit buffer. Surplus weight adjustment is
  214. * immediately canceled if the vtime margin runs below 10%.
  215. */
  216. MARGIN_MIN_PCT = 10,
  217. MARGIN_LOW_PCT = 20,
  218. MARGIN_TARGET_PCT = 50,
  219. INUSE_ADJ_STEP_PCT = 25,
  220. /* Have some play in timer operations */
  221. TIMER_SLACK_PCT = 1,
  222. /* 1/64k is granular enough and can easily be handled w/ u32 */
  223. WEIGHT_ONE = 1 << 16,
  224. };
  225. enum {
  226. /*
  227. * As vtime is used to calculate the cost of each IO, it needs to
  228. * be fairly high precision. For example, it should be able to
  229. * represent the cost of a single page worth of discard with
  230. * sufficient accuracy. At the same time, it should be able to
  231. * represent reasonably long enough durations to be useful and
  232. * convenient during operation.
  233. *
  234. * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
  235. * granularity and days of wrap-around time even at extreme vrates.
  236. */
  237. VTIME_PER_SEC_SHIFT = 37,
  238. VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
  239. VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,
  240. VTIME_PER_NSEC = VTIME_PER_SEC / NSEC_PER_SEC,
  241. /* bound vrate adjustments within two orders of magnitude */
  242. VRATE_MIN_PPM = 10000, /* 1% */
  243. VRATE_MAX_PPM = 100000000, /* 10000% */
  244. VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
  245. VRATE_CLAMP_ADJ_PCT = 4,
  246. /* switch iff the conditions are met for longer than this */
  247. AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
  248. };
  249. enum {
  250. /* if IOs end up waiting for requests, issue less */
  251. RQ_WAIT_BUSY_PCT = 5,
  252. /* unbusy hysteresis */
  253. UNBUSY_THR_PCT = 75,
  254. /*
  255. * The effect of delay is indirect and non-linear and a huge amount of
  256. * future debt can accumulate abruptly while unthrottled. Linearly scale
  257. * up delay as debt is going up and then let it decay exponentially.
  258. * This gives us quick ramp ups while delay is accumulating and long tails
  259. * which can help reduce the frequency of debt explosions on unthrottle; a
  260. * worked example follows this enum. The parameters are experimentally determined.
  261. *
  262. * The delay mechanism provides adequate protection and behavior in many
  263. * cases. However, this is far from ideal and falls short on both
  264. * fronts. The debtors are often throttled too harshly, costing a
  265. * significant level of fairness and possibly total work while the
  266. * protection against their impacts on the system can be choppy and
  267. * unreliable.
  268. *
  269. * The shortcoming primarily stems from the fact that, unlike for page
  270. * cache, the kernel doesn't have well-defined back-pressure propagation
  271. * mechanism and policies for anonymous memory. Fully addressing this
  272. * issue will likely require substantial improvements in the area.
  273. */
  274. MIN_DELAY_THR_PCT = 500,
  275. MAX_DELAY_THR_PCT = 25000,
  276. MIN_DELAY = 250,
  277. MAX_DELAY = 250 * USEC_PER_MSEC,
  278. /* halve debts if avg usage over 100ms is under 50% */
  279. DFGV_USAGE_PCT = 50,
  280. DFGV_PERIOD = 100 * USEC_PER_MSEC,
  281. /* don't let cmds which take a very long time pin lagging for too long */
  282. MAX_LAGGING_PERIODS = 10,
  283. /*
  284. * Count IO size in 4k pages. The 12-bit shift helps keep the
  285. * size-proportional components of the cost calculation in a similar
  286. * number of digits to the per-IO cost components.
  287. */
  288. IOC_PAGE_SHIFT = 12,
  289. IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT,
  290. IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
  291. /* if apart further than 16M, consider randio for linear model */
  292. LCOEF_RANDIO_PAGES = 4096,
  293. };
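/*
 * Worked example of the linear delay scaling described above: with debt
 * overage at 12750% of a period, roughly halfway between the two
 * thresholds, iocg_kick_delay() arrives at
 *
 *	MIN_DELAY + (MAX_DELAY - MIN_DELAY) * (12750 - 500) / (25000 - 500)
 *	= 250 + 249750 / 2 ~= 125ms
 */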
  294. enum ioc_running {
  295. IOC_IDLE,
  296. IOC_RUNNING,
  297. IOC_STOP,
  298. };
  299. /* io.cost.qos controls including per-dev enable of the whole controller */
  300. enum {
  301. QOS_ENABLE,
  302. QOS_CTRL,
  303. NR_QOS_CTRL_PARAMS,
  304. };
  305. /* io.cost.qos params */
  306. enum {
  307. QOS_RPPM,
  308. QOS_RLAT,
  309. QOS_WPPM,
  310. QOS_WLAT,
  311. QOS_MIN,
  312. QOS_MAX,
  313. NR_QOS_PARAMS,
  314. };
  315. /* io.cost.model controls */
  316. enum {
  317. COST_CTRL,
  318. COST_MODEL,
  319. NR_COST_CTRL_PARAMS,
  320. };
  321. /* builtin linear cost model coefficients */
  322. enum {
  323. I_LCOEF_RBPS,
  324. I_LCOEF_RSEQIOPS,
  325. I_LCOEF_RRANDIOPS,
  326. I_LCOEF_WBPS,
  327. I_LCOEF_WSEQIOPS,
  328. I_LCOEF_WRANDIOPS,
  329. NR_I_LCOEFS,
  330. };
  331. enum {
  332. LCOEF_RPAGE,
  333. LCOEF_RSEQIO,
  334. LCOEF_RRANDIO,
  335. LCOEF_WPAGE,
  336. LCOEF_WSEQIO,
  337. LCOEF_WRANDIO,
  338. NR_LCOEFS,
  339. };
  340. enum {
  341. AUTOP_INVALID,
  342. AUTOP_HDD,
  343. AUTOP_SSD_QD1,
  344. AUTOP_SSD_DFL,
  345. AUTOP_SSD_FAST,
  346. };
  347. struct ioc_params {
  348. u32 qos[NR_QOS_PARAMS];
  349. u64 i_lcoefs[NR_I_LCOEFS];
  350. u64 lcoefs[NR_LCOEFS];
  351. u32 too_fast_vrate_pct;
  352. u32 too_slow_vrate_pct;
  353. };
  354. struct ioc_margins {
  355. s64 min;
  356. s64 low;
  357. s64 target;
  358. };
  359. struct ioc_missed {
  360. local_t nr_met;
  361. local_t nr_missed;
  362. u32 last_met;
  363. u32 last_missed;
  364. };
  365. struct ioc_pcpu_stat {
  366. struct ioc_missed missed[2];
  367. local64_t rq_wait_ns;
  368. u64 last_rq_wait_ns;
  369. };
  370. /* per device */
  371. struct ioc {
  372. struct rq_qos rqos;
  373. bool enabled;
  374. struct ioc_params params;
  375. struct ioc_margins margins;
  376. u32 period_us;
  377. u32 timer_slack_ns;
  378. u64 vrate_min;
  379. u64 vrate_max;
  380. spinlock_t lock;
  381. struct timer_list timer;
  382. struct list_head active_iocgs; /* active cgroups */
  383. struct ioc_pcpu_stat __percpu *pcpu_stat;
  384. enum ioc_running running;
  385. atomic64_t vtime_rate;
  386. u64 vtime_base_rate;
  387. s64 vtime_err;
  388. seqcount_spinlock_t period_seqcount;
  389. u64 period_at; /* wallclock starttime */
  390. u64 period_at_vtime; /* vtime starttime */
  391. atomic64_t cur_period; /* inc'd each period */
  392. int busy_level; /* saturation history */
  393. bool weights_updated;
  394. atomic_t hweight_gen; /* for lazy hweights */
  395. /* debt forgivness */
  396. u64 dfgv_period_at;
  397. u64 dfgv_period_rem;
  398. u64 dfgv_usage_us_sum;
  399. u64 autop_too_fast_at;
  400. u64 autop_too_slow_at;
  401. int autop_idx;
  402. bool user_qos_params:1;
  403. bool user_cost_model:1;
  404. };
  405. struct iocg_pcpu_stat {
  406. local64_t abs_vusage;
  407. };
  408. struct iocg_stat {
  409. u64 usage_us;
  410. u64 wait_us;
  411. u64 indebt_us;
  412. u64 indelay_us;
  413. };
  414. /* per device-cgroup pair */
  415. struct ioc_gq {
  416. struct blkg_policy_data pd;
  417. struct ioc *ioc;
  418. /*
  419. * A iocg can get its weight from two sources - an explicit
  420. * per-device-cgroup configuration or the default weight of the
  421. * cgroup. `cfg_weight` is the explicit per-device-cgroup
  422. * configuration. `weight` is the effective weight considering both
  423. * sources.
  424. *
  425. * When an idle cgroup becomes active its `active` goes from 0 to
  426. * `weight`. `inuse` is the surplus adjusted active weight.
  427. * `active` and `inuse` are used to calculate `hweight_active` and
  428. * `hweight_inuse`.
  429. *
  430. * `last_inuse` remembers `inuse` while an iocg is idle to persist
  431. * surplus adjustments.
  432. *
  433. * `inuse` may be adjusted dynamically during a period. `saved_*` are used
  434. * to determine and track adjustments.
  435. */
  436. u32 cfg_weight;
  437. u32 weight;
  438. u32 active;
  439. u32 inuse;
  440. u32 last_inuse;
  441. s64 saved_margin;
  442. sector_t cursor; /* to detect randio */
  443. /*
  444. * `vtime` is this iocg's vtime cursor which progresses as IOs are
  445. * issued. If lagging behind device vtime, the delta represents
  446. * the currently available IO budget. If running ahead, the
  447. * overage.
  448. *
  449. * `done_vtime` is the same but progressed on completion rather
  450. * than issue. The delta behind `vtime` represents the cost of
  451. * currently in-flight IOs.
  452. */
  453. atomic64_t vtime;
  454. atomic64_t done_vtime;
  455. u64 abs_vdebt;
  456. /* current delay in effect and when it started */
  457. u64 delay;
  458. u64 delay_at;
  459. /*
  460. * The period this iocg was last active in. Used for deactivation
  461. * and invalidating `vtime`.
  462. */
  463. atomic64_t active_period;
  464. struct list_head active_list;
  465. /* see __propagate_weights() and current_hweight() for details */
  466. u64 child_active_sum;
  467. u64 child_inuse_sum;
  468. u64 child_adjusted_sum;
  469. int hweight_gen;
  470. u32 hweight_active;
  471. u32 hweight_inuse;
  472. u32 hweight_donating;
  473. u32 hweight_after_donation;
  474. struct list_head walk_list;
  475. struct list_head surplus_list;
  476. struct wait_queue_head waitq;
  477. struct hrtimer waitq_timer;
  478. /* timestamp at the latest activation */
  479. u64 activated_at;
  480. /* statistics */
  481. struct iocg_pcpu_stat __percpu *pcpu_stat;
  482. struct iocg_stat stat;
  483. struct iocg_stat last_stat;
  484. u64 last_stat_abs_vusage;
  485. u64 usage_delta_us;
  486. u64 wait_since;
  487. u64 indebt_since;
  488. u64 indelay_since;
  489. /* this iocg's depth in the hierarchy and ancestors including self */
  490. int level;
  491. struct ioc_gq *ancestors[];
  492. };
  493. /* per cgroup */
  494. struct ioc_cgrp {
  495. struct blkcg_policy_data cpd;
  496. unsigned int dfl_weight;
  497. };
  498. struct ioc_now {
  499. u64 now_ns;
  500. u64 now;
  501. u64 vnow;
  502. };
  503. struct iocg_wait {
  504. struct wait_queue_entry wait;
  505. struct bio *bio;
  506. u64 abs_cost;
  507. bool committed;
  508. };
  509. struct iocg_wake_ctx {
  510. struct ioc_gq *iocg;
  511. u32 hw_inuse;
  512. s64 vbudget;
  513. };
  514. static const struct ioc_params autop[] = {
  515. [AUTOP_HDD] = {
  516. .qos = {
  517. [QOS_RLAT] = 250000, /* 250ms */
  518. [QOS_WLAT] = 250000,
  519. [QOS_MIN] = VRATE_MIN_PPM,
  520. [QOS_MAX] = VRATE_MAX_PPM,
  521. },
  522. .i_lcoefs = {
  523. [I_LCOEF_RBPS] = 174019176,
  524. [I_LCOEF_RSEQIOPS] = 41708,
  525. [I_LCOEF_RRANDIOPS] = 370,
  526. [I_LCOEF_WBPS] = 178075866,
  527. [I_LCOEF_WSEQIOPS] = 42705,
  528. [I_LCOEF_WRANDIOPS] = 378,
  529. },
  530. },
  531. [AUTOP_SSD_QD1] = {
  532. .qos = {
  533. [QOS_RLAT] = 25000, /* 25ms */
  534. [QOS_WLAT] = 25000,
  535. [QOS_MIN] = VRATE_MIN_PPM,
  536. [QOS_MAX] = VRATE_MAX_PPM,
  537. },
  538. .i_lcoefs = {
  539. [I_LCOEF_RBPS] = 245855193,
  540. [I_LCOEF_RSEQIOPS] = 61575,
  541. [I_LCOEF_RRANDIOPS] = 6946,
  542. [I_LCOEF_WBPS] = 141365009,
  543. [I_LCOEF_WSEQIOPS] = 33716,
  544. [I_LCOEF_WRANDIOPS] = 26796,
  545. },
  546. },
  547. [AUTOP_SSD_DFL] = {
  548. .qos = {
  549. [QOS_RLAT] = 25000, /* 25ms */
  550. [QOS_WLAT] = 25000,
  551. [QOS_MIN] = VRATE_MIN_PPM,
  552. [QOS_MAX] = VRATE_MAX_PPM,
  553. },
  554. .i_lcoefs = {
  555. [I_LCOEF_RBPS] = 488636629,
  556. [I_LCOEF_RSEQIOPS] = 8932,
  557. [I_LCOEF_RRANDIOPS] = 8518,
  558. [I_LCOEF_WBPS] = 427891549,
  559. [I_LCOEF_WSEQIOPS] = 28755,
  560. [I_LCOEF_WRANDIOPS] = 21940,
  561. },
  562. .too_fast_vrate_pct = 500,
  563. },
  564. [AUTOP_SSD_FAST] = {
  565. .qos = {
  566. [QOS_RLAT] = 5000, /* 5ms */
  567. [QOS_WLAT] = 5000,
  568. [QOS_MIN] = VRATE_MIN_PPM,
  569. [QOS_MAX] = VRATE_MAX_PPM,
  570. },
  571. .i_lcoefs = {
  572. [I_LCOEF_RBPS] = 3102524156LLU,
  573. [I_LCOEF_RSEQIOPS] = 724816,
  574. [I_LCOEF_RRANDIOPS] = 778122,
  575. [I_LCOEF_WBPS] = 1742780862LLU,
  576. [I_LCOEF_WSEQIOPS] = 425702,
  577. [I_LCOEF_WRANDIOPS] = 443193,
  578. },
  579. .too_slow_vrate_pct = 10,
  580. },
  581. };
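/*
 * For orientation, the i_lcoefs above are in bytes/sec and 4k IOs/sec -
 * e.g. AUTOP_HDD models roughly 174 MB/s sequential reads and 370 random
 * read IOPS. calc_lcoefs() below converts them into per-page and per-IO
 * vtime costs.
 */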
  582. /*
  583. * vrate adjust percentages indexed by ioc->busy_level. We adjust up on
  584. * vtime credit shortage and down on device saturation.
  585. */
  586. static const u32 vrate_adj_pct[] =
  587. { 0, 0, 0, 0,
  588. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  589. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  590. 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
  591. static struct blkcg_policy blkcg_policy_iocost;
  592. /* accessors and helpers */
  593. static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
  594. {
  595. return container_of(rqos, struct ioc, rqos);
  596. }
  597. static struct ioc *q_to_ioc(struct request_queue *q)
  598. {
  599. return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
  600. }
  601. static const char __maybe_unused *ioc_name(struct ioc *ioc)
  602. {
  603. struct gendisk *disk = ioc->rqos.disk;
  604. if (!disk)
  605. return "<unknown>";
  606. return disk->disk_name;
  607. }
  608. static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
  609. {
  610. return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
  611. }
  612. static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
  613. {
  614. return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
  615. }
  616. static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
  617. {
  618. return pd_to_blkg(&iocg->pd);
  619. }
  620. static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
  621. {
  622. return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
  623. struct ioc_cgrp, cpd);
  624. }
  625. /*
  626. * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
  627. * weight, the more expensive each IO. Must round up.
  628. */
  629. static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
  630. {
  631. return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
  632. }
  633. /*
  634. * The inverse of abs_cost_to_cost(). Must round up.
  635. */
  636. static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
  637. {
  638. return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
  639. }
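/*
 * For example, with hw_inuse at a quarter of WEIGHT_ONE (a 25% hierarchical
 * share), abs_cost_to_cost(1000, WEIGHT_ONE / 4) returns 4000 - each unit of
 * absolute cost consumes four units of the cgroup's vtime.
 */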
  640. static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
  641. u64 abs_cost, u64 cost)
  642. {
  643. struct iocg_pcpu_stat *gcs;
  644. bio->bi_iocost_cost = cost;
  645. atomic64_add(cost, &iocg->vtime);
  646. gcs = get_cpu_ptr(iocg->pcpu_stat);
  647. local64_add(abs_cost, &gcs->abs_vusage);
  648. put_cpu_ptr(gcs);
  649. }
  650. static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
  651. {
  652. if (lock_ioc) {
  653. spin_lock_irqsave(&iocg->ioc->lock, *flags);
  654. spin_lock(&iocg->waitq.lock);
  655. } else {
  656. spin_lock_irqsave(&iocg->waitq.lock, *flags);
  657. }
  658. }
  659. static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
  660. {
  661. if (unlock_ioc) {
  662. spin_unlock(&iocg->waitq.lock);
  663. spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
  664. } else {
  665. spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
  666. }
  667. }
  668. #define CREATE_TRACE_POINTS
  669. #include <trace/events/iocost.h>
  670. static void ioc_refresh_margins(struct ioc *ioc)
  671. {
  672. struct ioc_margins *margins = &ioc->margins;
  673. u32 period_us = ioc->period_us;
  674. u64 vrate = ioc->vtime_base_rate;
  675. margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
  676. margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
  677. margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
  678. }
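/*
 * For example, with period_us = 50000 (50ms) and vtime_base_rate at 100%
 * (VTIME_PER_USEC), margins->min, low and target come out to 5ms, 10ms and
 * 25ms worth of vtime respectively.
 */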
  679. /* latency QoS params changed, update period_us and all the dependent params */
  680. static void ioc_refresh_period_us(struct ioc *ioc)
  681. {
  682. u32 ppm, lat, multi, period_us;
  683. lockdep_assert_held(&ioc->lock);
  684. /* pick the higher latency target */
  685. if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
  686. ppm = ioc->params.qos[QOS_RPPM];
  687. lat = ioc->params.qos[QOS_RLAT];
  688. } else {
  689. ppm = ioc->params.qos[QOS_WPPM];
  690. lat = ioc->params.qos[QOS_WLAT];
  691. }
  692. /*
  693. * We want the period to be long enough to contain a healthy number
  694. * of IOs while short enough for granular control. Define it as a
  695. * multiple of the latency target. Ideally, the multiplier should
  696. * be scaled according to the percentile so that it would nominally
  697. * contain a certain number of requests. Let's be simpler and
  698. * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
  699. */
  700. if (ppm)
  701. multi = max_t(u32, (MILLION - ppm) / 50000, 2);
  702. else
  703. multi = 2;
  704. period_us = multi * lat;
  705. period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
  706. /* calculate dependent params */
  707. ioc->period_us = period_us;
  708. ioc->timer_slack_ns = div64_u64(
  709. (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
  710. 100);
  711. ioc_refresh_margins(ioc);
  712. }
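/*
 * For example, with the AUTOP_SSD_DFL defaults (rlat = wlat = 25ms, no
 * percentile configured), ppm is 0, multi is 2 and period_us comes out to
 * 50ms. A pct(50) target would push multi to 10 and the period to 250ms.
 */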
  713. /*
  714. * ioc->rqos.disk isn't initialized when this function is called from
  715. * the init path.
  716. */
  717. static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk)
  718. {
  719. int idx = ioc->autop_idx;
  720. const struct ioc_params *p = &autop[idx];
  721. u32 vrate_pct;
  722. u64 now_ns;
  723. /* rotational? */
  724. if (!blk_queue_nonrot(disk->queue))
  725. return AUTOP_HDD;
  726. /* handle SATA SSDs w/ broken NCQ */
  727. if (blk_queue_depth(disk->queue) == 1)
  728. return AUTOP_SSD_QD1;
  729. /* use one of the normal ssd sets */
  730. if (idx < AUTOP_SSD_DFL)
  731. return AUTOP_SSD_DFL;
  732. /* if user is overriding anything, maintain what was there */
  733. if (ioc->user_qos_params || ioc->user_cost_model)
  734. return idx;
  735. /* step up/down based on the vrate */
  736. vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
  737. now_ns = blk_time_get_ns();
  738. if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
  739. if (!ioc->autop_too_fast_at)
  740. ioc->autop_too_fast_at = now_ns;
  741. if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
  742. return idx + 1;
  743. } else {
  744. ioc->autop_too_fast_at = 0;
  745. }
  746. if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
  747. if (!ioc->autop_too_slow_at)
  748. ioc->autop_too_slow_at = now_ns;
  749. if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
  750. return idx - 1;
  751. } else {
  752. ioc->autop_too_slow_at = 0;
  753. }
  754. return idx;
  755. }
  756. /*
  757. * Take the following as input
  758. *
  759. * @bps maximum sequential throughput
  760. * @seqiops maximum sequential 4k iops
  761. * @randiops maximum random 4k iops
  762. *
  763. * and calculate the linear model cost coefficients.
  764. *
  765. * *@page      per-page cost            1s / (@bps / 4096)
  766. * *@seqio     base cost of a seq IO    max((1s / @seqiops) - *@page, 0)
  767. * *@randio    base cost of a rand IO   max((1s / @randiops) - *@page, 0)
  768. */
  769. static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
  770. u64 *page, u64 *seqio, u64 *randio)
  771. {
  772. u64 v;
  773. *page = *seqio = *randio = 0;
  774. if (bps) {
  775. u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE);
  776. if (bps_pages)
  777. *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages);
  778. else
  779. *page = 1;
  780. }
  781. if (seqiops) {
  782. v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
  783. if (v > *page)
  784. *seqio = v - *page;
  785. }
  786. if (randiops) {
  787. v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
  788. if (v > *page)
  789. *randio = v - *page;
  790. }
  791. }
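/*
 * For example, plugging in the AUTOP_HDD coefficients: @bps = 174019176
 * makes a 4k page cost about 4096 / 174019176 ~= 23.5us of device time and
 * @randiops = 370 adds a random-IO base cost of about 1 / 370 ~= 2.7ms minus
 * the per-page cost, both expressed in VTIME_PER_SEC-scaled units.
 */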
  792. static void ioc_refresh_lcoefs(struct ioc *ioc)
  793. {
  794. u64 *u = ioc->params.i_lcoefs;
  795. u64 *c = ioc->params.lcoefs;
  796. calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
  797. &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
  798. calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
  799. &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
  800. }
  801. /*
  802. * struct gendisk is required as an argument because ioc->rqos.disk
  803. * is not properly initialized when called from the init path.
  804. */
  805. static bool ioc_refresh_params_disk(struct ioc *ioc, bool force,
  806. struct gendisk *disk)
  807. {
  808. const struct ioc_params *p;
  809. int idx;
  810. lockdep_assert_held(&ioc->lock);
  811. idx = ioc_autop_idx(ioc, disk);
  812. p = &autop[idx];
  813. if (idx == ioc->autop_idx && !force)
  814. return false;
  815. if (idx != ioc->autop_idx) {
  816. atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
  817. ioc->vtime_base_rate = VTIME_PER_USEC;
  818. }
  819. ioc->autop_idx = idx;
  820. ioc->autop_too_fast_at = 0;
  821. ioc->autop_too_slow_at = 0;
  822. if (!ioc->user_qos_params)
  823. memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
  824. if (!ioc->user_cost_model)
  825. memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
  826. ioc_refresh_period_us(ioc);
  827. ioc_refresh_lcoefs(ioc);
  828. ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
  829. VTIME_PER_USEC, MILLION);
  830. ioc->vrate_max = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MAX] *
  831. VTIME_PER_USEC, MILLION);
  832. return true;
  833. }
  834. static bool ioc_refresh_params(struct ioc *ioc, bool force)
  835. {
  836. return ioc_refresh_params_disk(ioc, force, ioc->rqos.disk);
  837. }
  838. /*
  839. * When an iocg accumulates too much vtime or gets deactivated, we throw away
  840. * some vtime, which lowers the overall device utilization. As the exact amount
  841. * which is being thrown away is known, we can compensate by accelerating the
  842. * vrate accordingly so that the extra vtime generated in the current period
  843. * matches what got lost.
  844. */
  845. static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now)
  846. {
  847. s64 pleft = ioc->period_at + ioc->period_us - now->now;
  848. s64 vperiod = ioc->period_us * ioc->vtime_base_rate;
  849. s64 vcomp, vcomp_min, vcomp_max;
  850. lockdep_assert_held(&ioc->lock);
  851. /* we need some time left in this period */
  852. if (pleft <= 0)
  853. goto done;
  854. /*
  855. * Calculate how much vrate should be adjusted to offset the error.
  856. * Limit the amount of adjustment and deduct the adjusted amount from
  857. * the error.
  858. */
  859. vcomp = -div64_s64(ioc->vtime_err, pleft);
  860. vcomp_min = -(ioc->vtime_base_rate >> 1);
  861. vcomp_max = ioc->vtime_base_rate;
  862. vcomp = clamp(vcomp, vcomp_min, vcomp_max);
  863. ioc->vtime_err += vcomp * pleft;
  864. atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp);
  865. done:
  866. /* bound how much error can accumulate */
  867. ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
  868. }
  869. static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
  870. int nr_lagging, int nr_shortages,
  871. int prev_busy_level, u32 *missed_ppm)
  872. {
  873. u64 vrate = ioc->vtime_base_rate;
  874. u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
  875. if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
  876. if (ioc->busy_level != prev_busy_level || nr_lagging)
  877. trace_iocost_ioc_vrate_adj(ioc, vrate,
  878. missed_ppm, rq_wait_pct,
  879. nr_lagging, nr_shortages);
  880. return;
  881. }
  882. /*
  883. * If vrate is out of bounds, apply clamp gradually as the
  884. * bounds can change abruptly. Otherwise, apply busy_level
  885. * based adjustment.
  886. */
  887. if (vrate < vrate_min) {
  888. vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100);
  889. vrate = min(vrate, vrate_min);
  890. } else if (vrate > vrate_max) {
  891. vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100);
  892. vrate = max(vrate, vrate_max);
  893. } else {
  894. int idx = min_t(int, abs(ioc->busy_level),
  895. ARRAY_SIZE(vrate_adj_pct) - 1);
  896. u32 adj_pct = vrate_adj_pct[idx];
  897. if (ioc->busy_level > 0)
  898. adj_pct = 100 - adj_pct;
  899. else
  900. adj_pct = 100 + adj_pct;
  901. vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
  902. vrate_min, vrate_max);
  903. }
  904. trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
  905. nr_lagging, nr_shortages);
  906. ioc->vtime_base_rate = vrate;
  907. ioc_refresh_margins(ioc);
  908. }
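/*
 * For example, at busy_level == -8 (credit shortage), vrate_adj_pct[8] is 1
 * and the base vrate is nudged up by 1%; at busy_level == +8 (saturation) it
 * is nudged down by 1%. The step grows to 16% at the end of the table.
 */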
  909. /* take a snapshot of the current [v]time and vrate */
  910. static void ioc_now(struct ioc *ioc, struct ioc_now *now)
  911. {
  912. unsigned seq;
  913. u64 vrate;
  914. now->now_ns = blk_time_get_ns();
  915. now->now = ktime_to_us(now->now_ns);
  916. vrate = atomic64_read(&ioc->vtime_rate);
  917. /*
  918. * The current vtime is
  919. *
  920. * vtime at period start + (wallclock time since the start) * vrate
  921. *
  922. * As a consistent snapshot of `period_at_vtime` and `period_at` is
  923. * needed, they're seqcount protected.
  924. */
  925. do {
  926. seq = read_seqcount_begin(&ioc->period_seqcount);
  927. now->vnow = ioc->period_at_vtime +
  928. (now->now - ioc->period_at) * vrate;
  929. } while (read_seqcount_retry(&ioc->period_seqcount, seq));
  930. }
  931. static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
  932. {
  933. WARN_ON_ONCE(ioc->running != IOC_RUNNING);
  934. write_seqcount_begin(&ioc->period_seqcount);
  935. ioc->period_at = now->now;
  936. ioc->period_at_vtime = now->vnow;
  937. write_seqcount_end(&ioc->period_seqcount);
  938. ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
  939. add_timer(&ioc->timer);
  940. }
  941. /*
  942. * Update @iocg's `active` and `inuse` to @active and @inuse, update level
  943. * weight sums and propagate upwards accordingly. If @save, the current margin
  944. * is saved to be used as reference for later inuse in-period adjustments.
  945. */
  946. static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
  947. bool save, struct ioc_now *now)
  948. {
  949. struct ioc *ioc = iocg->ioc;
  950. int lvl;
  951. lockdep_assert_held(&ioc->lock);
  952. /*
  953. * For an active leaf node, its inuse shouldn't be zero or exceed
  954. * @active. An active internal node's inuse is solely determined by the
  955. * inuse to active ratio of its children regardless of @inuse.
  956. */
  957. if (list_empty(&iocg->active_list) && iocg->child_active_sum) {
  958. inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum,
  959. iocg->child_active_sum);
  960. } else {
  961. /*
  962. * It may be tempting to turn this into a clamp expression with
  963. * a lower limit of 1 but active may be 0, which cannot be used
  964. * as an upper limit in that situation. This expression allows
  965. * active to clamp inuse unless it is 0, in which case inuse
  966. * becomes 1.
  967. */
  968. inuse = min(inuse, active) ?: 1;
  969. }
  970. iocg->last_inuse = iocg->inuse;
  971. if (save)
  972. iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime);
  973. if (active == iocg->active && inuse == iocg->inuse)
  974. return;
  975. for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
  976. struct ioc_gq *parent = iocg->ancestors[lvl];
  977. struct ioc_gq *child = iocg->ancestors[lvl + 1];
  978. u32 parent_active = 0, parent_inuse = 0;
  979. /* update the level sums */
  980. parent->child_active_sum += (s32)(active - child->active);
  981. parent->child_inuse_sum += (s32)(inuse - child->inuse);
  982. /* apply the updates */
  983. child->active = active;
  984. child->inuse = inuse;
  985. /*
  986. * The delta between the inuse and active sums indicates how
  987. * much of the weight is being given away. The parent's inuse
  988. * and active should reflect the ratio.
  989. */
  990. if (parent->child_active_sum) {
  991. parent_active = parent->weight;
  992. parent_inuse = DIV64_U64_ROUND_UP(
  993. parent_active * parent->child_inuse_sum,
  994. parent->child_active_sum);
  995. }
  996. /* do we need to keep walking up? */
  997. if (parent_active == parent->active &&
  998. parent_inuse == parent->inuse)
  999. break;
  1000. active = parent_active;
  1001. inuse = parent_inuse;
  1002. }
  1003. ioc->weights_updated = true;
  1004. }
  1005. static void commit_weights(struct ioc *ioc)
  1006. {
  1007. lockdep_assert_held(&ioc->lock);
  1008. if (ioc->weights_updated) {
  1009. /* paired with rmb in current_hweight(), see there */
  1010. smp_wmb();
  1011. atomic_inc(&ioc->hweight_gen);
  1012. ioc->weights_updated = false;
  1013. }
  1014. }
  1015. static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
  1016. bool save, struct ioc_now *now)
  1017. {
  1018. __propagate_weights(iocg, active, inuse, save, now);
  1019. commit_weights(iocg->ioc);
  1020. }
  1021. static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
  1022. {
  1023. struct ioc *ioc = iocg->ioc;
  1024. int lvl;
  1025. u32 hwa, hwi;
  1026. int ioc_gen;
  1027. /* hot path - if uptodate, use cached */
  1028. ioc_gen = atomic_read(&ioc->hweight_gen);
  1029. if (ioc_gen == iocg->hweight_gen)
  1030. goto out;
  1031. /*
  1032. * Paired with wmb in commit_weights(). If we saw the updated
  1033. * hweight_gen, all the weight updates from __propagate_weights() are
  1034. * visible too.
  1035. *
  1036. * We can race with weight updates during calculation and get it
  1037. * wrong. However, hweight_gen would have changed and a future
  1038. * reader will recalculate and we're guaranteed to discard the
  1039. * wrong result soon.
  1040. */
  1041. smp_rmb();
  1042. hwa = hwi = WEIGHT_ONE;
  1043. for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
  1044. struct ioc_gq *parent = iocg->ancestors[lvl];
  1045. struct ioc_gq *child = iocg->ancestors[lvl + 1];
  1046. u64 active_sum = READ_ONCE(parent->child_active_sum);
  1047. u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
  1048. u32 active = READ_ONCE(child->active);
  1049. u32 inuse = READ_ONCE(child->inuse);
  1050. /* we can race with deactivations and either may read as zero */
  1051. if (!active_sum || !inuse_sum)
  1052. continue;
  1053. active_sum = max_t(u64, active, active_sum);
  1054. hwa = div64_u64((u64)hwa * active, active_sum);
  1055. inuse_sum = max_t(u64, inuse, inuse_sum);
  1056. hwi = div64_u64((u64)hwi * inuse, inuse_sum);
  1057. }
  1058. iocg->hweight_active = max_t(u32, hwa, 1);
  1059. iocg->hweight_inuse = max_t(u32, hwi, 1);
  1060. iocg->hweight_gen = ioc_gen;
  1061. out:
  1062. if (hw_activep)
  1063. *hw_activep = iocg->hweight_active;
  1064. if (hw_inusep)
  1065. *hw_inusep = iocg->hweight_inuse;
  1066. }
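/*
 * Tying this back to the example hierarchy in the header comment: for A0 the
 * ancestors[] walk computes WEIGHT_ONE * 100/400 (A's share under root) *
 * 100/200 (A0's share under A) = 8192, i.e. 12.5% of WEIGHT_ONE.
 */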
  1067. /*
  1068. * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the
  1069. * other weights stay unchanged.
  1070. */
  1071. static u32 current_hweight_max(struct ioc_gq *iocg)
  1072. {
  1073. u32 hwm = WEIGHT_ONE;
  1074. u32 inuse = iocg->active;
  1075. u64 child_inuse_sum;
  1076. int lvl;
  1077. lockdep_assert_held(&iocg->ioc->lock);
  1078. for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
  1079. struct ioc_gq *parent = iocg->ancestors[lvl];
  1080. struct ioc_gq *child = iocg->ancestors[lvl + 1];
  1081. child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse;
  1082. hwm = div64_u64((u64)hwm * inuse, child_inuse_sum);
  1083. inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum,
  1084. parent->child_active_sum);
  1085. }
  1086. return max_t(u32, hwm, 1);
  1087. }
  1088. static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
  1089. {
  1090. struct ioc *ioc = iocg->ioc;
  1091. struct blkcg_gq *blkg = iocg_to_blkg(iocg);
  1092. struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
  1093. u32 weight;
  1094. lockdep_assert_held(&ioc->lock);
  1095. weight = iocg->cfg_weight ?: iocc->dfl_weight;
  1096. if (weight != iocg->weight && iocg->active)
  1097. propagate_weights(iocg, weight, iocg->inuse, true, now);
  1098. iocg->weight = weight;
  1099. }
  1100. static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
  1101. {
  1102. struct ioc *ioc = iocg->ioc;
  1103. u64 __maybe_unused last_period, cur_period;
  1104. u64 vtime, vtarget;
  1105. int i;
  1106. /*
  1107. * If we already seem to be active, just update the stamp to tell the
  1108. * timer that we're still active. We don't mind occasional races.
  1109. */
  1110. if (!list_empty(&iocg->active_list)) {
  1111. ioc_now(ioc, now);
  1112. cur_period = atomic64_read(&ioc->cur_period);
  1113. if (atomic64_read(&iocg->active_period) != cur_period)
  1114. atomic64_set(&iocg->active_period, cur_period);
  1115. return true;
  1116. }
  1117. /* racy check on internal node IOs, treat as root level IOs */
  1118. if (iocg->child_active_sum)
  1119. return false;
  1120. spin_lock_irq(&ioc->lock);
  1121. ioc_now(ioc, now);
  1122. /* update period */
  1123. cur_period = atomic64_read(&ioc->cur_period);
  1124. last_period = atomic64_read(&iocg->active_period);
  1125. atomic64_set(&iocg->active_period, cur_period);
  1126. /* already activated or breaking leaf-only constraint? */
  1127. if (!list_empty(&iocg->active_list))
  1128. goto succeed_unlock;
  1129. for (i = iocg->level - 1; i > 0; i--)
  1130. if (!list_empty(&iocg->ancestors[i]->active_list))
  1131. goto fail_unlock;
  1132. if (iocg->child_active_sum)
  1133. goto fail_unlock;
  1134. /*
  1135. * Always start with the target budget. On deactivation, we throw away
  1136. * anything above it.
  1137. */
  1138. vtarget = now->vnow - ioc->margins.target;
  1139. vtime = atomic64_read(&iocg->vtime);
  1140. atomic64_add(vtarget - vtime, &iocg->vtime);
  1141. atomic64_add(vtarget - vtime, &iocg->done_vtime);
  1142. vtime = vtarget;
  1143. /*
  1144. * Activate, propagate weight and start period timer if not
  1145. * running. Reset hweight_gen to avoid accidental match from
  1146. * wrapping.
  1147. */
  1148. iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
  1149. list_add(&iocg->active_list, &ioc->active_iocgs);
  1150. propagate_weights(iocg, iocg->weight,
  1151. iocg->last_inuse ?: iocg->weight, true, now);
  1152. TRACE_IOCG_PATH(iocg_activate, iocg, now,
  1153. last_period, cur_period, vtime);
  1154. iocg->activated_at = now->now;
  1155. if (ioc->running == IOC_IDLE) {
  1156. ioc->running = IOC_RUNNING;
  1157. ioc->dfgv_period_at = now->now;
  1158. ioc->dfgv_period_rem = 0;
  1159. ioc_start_period(ioc, now);
  1160. }
  1161. succeed_unlock:
  1162. spin_unlock_irq(&ioc->lock);
  1163. return true;
  1164. fail_unlock:
  1165. spin_unlock_irq(&ioc->lock);
  1166. return false;
  1167. }
  1168. static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
  1169. {
  1170. struct ioc *ioc = iocg->ioc;
  1171. struct blkcg_gq *blkg = iocg_to_blkg(iocg);
  1172. u64 tdelta, delay, new_delay, shift;
  1173. s64 vover, vover_pct;
  1174. u32 hwa;
  1175. lockdep_assert_held(&iocg->waitq.lock);
  1176. /*
  1177. * If the delay is set by another CPU, we may be in the past. No need to
  1178. * change anything if so. This avoids decay calculation underflow.
  1179. */
  1180. if (time_before64(now->now, iocg->delay_at))
  1181. return false;
  1182. /* calculate the current delay in effect - 1/2 every second */
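/* e.g. a delay set 2.5 seconds ago has decayed through two halvings */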
  1183. tdelta = now->now - iocg->delay_at;
  1184. shift = div64_u64(tdelta, USEC_PER_SEC);
  1185. if (iocg->delay && shift < BITS_PER_LONG)
  1186. delay = iocg->delay >> shift;
  1187. else
  1188. delay = 0;
  1189. /* calculate the new delay from the debt amount */
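/*
 * vover is how far vtime (including pending debt) is ahead of vnow; as a
 * percentage of one period's worth of vtime it maps linearly onto
 * [MIN_DELAY, MAX_DELAY] between the two thresholds.
 */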
  1190. current_hweight(iocg, &hwa, NULL);
  1191. vover = atomic64_read(&iocg->vtime) +
  1192. abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow;
  1193. vover_pct = div64_s64(100 * vover,
  1194. ioc->period_us * ioc->vtime_base_rate);
  1195. if (vover_pct <= MIN_DELAY_THR_PCT)
  1196. new_delay = 0;
  1197. else if (vover_pct >= MAX_DELAY_THR_PCT)
  1198. new_delay = MAX_DELAY;
  1199. else
  1200. new_delay = MIN_DELAY +
  1201. div_u64((MAX_DELAY - MIN_DELAY) *
  1202. (vover_pct - MIN_DELAY_THR_PCT),
  1203. MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);
  1204. /* pick the higher one and apply */
  1205. if (new_delay > delay) {
  1206. iocg->delay = new_delay;
  1207. iocg->delay_at = now->now;
  1208. delay = new_delay;
  1209. }
  1210. if (delay >= MIN_DELAY) {
  1211. if (!iocg->indelay_since)
  1212. iocg->indelay_since = now->now;
  1213. blkcg_set_delay(blkg, delay * NSEC_PER_USEC);
  1214. return true;
  1215. } else {
  1216. if (iocg->indelay_since) {
  1217. iocg->stat.indelay_us += now->now - iocg->indelay_since;
  1218. iocg->indelay_since = 0;
  1219. }
  1220. iocg->delay = 0;
  1221. blkcg_clear_delay(blkg);
  1222. return false;
  1223. }
  1224. }
  1225. static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost,
  1226. struct ioc_now *now)
  1227. {
  1228. struct iocg_pcpu_stat *gcs;
  1229. lockdep_assert_held(&iocg->ioc->lock);
  1230. lockdep_assert_held(&iocg->waitq.lock);
  1231. WARN_ON_ONCE(list_empty(&iocg->active_list));
  1232. /*
  1233. * Once in debt, debt handling owns inuse. @iocg stays at the minimum
1234. * inuse, donating all of its share to others until its debt is paid off.
  1235. */
  1236. if (!iocg->abs_vdebt && abs_cost) {
  1237. iocg->indebt_since = now->now;
  1238. propagate_weights(iocg, iocg->active, 0, false, now);
  1239. }
  1240. iocg->abs_vdebt += abs_cost;
  1241. gcs = get_cpu_ptr(iocg->pcpu_stat);
  1242. local64_add(abs_cost, &gcs->abs_vusage);
  1243. put_cpu_ptr(gcs);
  1244. }
  1245. static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay,
  1246. struct ioc_now *now)
  1247. {
  1248. lockdep_assert_held(&iocg->ioc->lock);
  1249. lockdep_assert_held(&iocg->waitq.lock);
  1250. /*
1251. * Make sure that nobody messed with @iocg. Check iocg->pd.online
1252. * to avoid a warning when removing the blkcg or disk.
  1253. */
  1254. WARN_ON_ONCE(list_empty(&iocg->active_list) && iocg->pd.online);
  1255. WARN_ON_ONCE(iocg->inuse > 1);
  1256. iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt);
  1257. /* if debt is paid in full, restore inuse */
  1258. if (!iocg->abs_vdebt) {
  1259. iocg->stat.indebt_us += now->now - iocg->indebt_since;
  1260. iocg->indebt_since = 0;
  1261. propagate_weights(iocg, iocg->active, iocg->last_inuse,
  1262. false, now);
  1263. }
  1264. }
  1265. static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
  1266. int flags, void *key)
  1267. {
  1268. struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
  1269. struct iocg_wake_ctx *ctx = key;
  1270. u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
  1271. ctx->vbudget -= cost;
  1272. if (ctx->vbudget < 0)
  1273. return -1;
  1274. iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
  1275. wait->committed = true;
  1276. /*
  1277. * autoremove_wake_function() removes the wait entry only when it
  1278. * actually changed the task state. We want the wait always removed.
  1279. * Remove explicitly and use default_wake_function(). Note that the
  1280. * order of operations is important as finish_wait() tests whether
  1281. * @wq_entry is removed without grabbing the lock.
  1282. */
  1283. default_wake_function(wq_entry, mode, flags, key);
  1284. list_del_init_careful(&wq_entry->entry);
  1285. return 0;
  1286. }
  1287. /*
  1288. * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters
  1289. * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
  1290. * addition to iocg->waitq.lock.
  1291. */
  1292. static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
  1293. struct ioc_now *now)
  1294. {
  1295. struct ioc *ioc = iocg->ioc;
  1296. struct iocg_wake_ctx ctx = { .iocg = iocg };
  1297. u64 vshortage, expires, oexpires;
  1298. s64 vbudget;
  1299. u32 hwa;
  1300. lockdep_assert_held(&iocg->waitq.lock);
  1301. current_hweight(iocg, &hwa, NULL);
  1302. vbudget = now->vnow - atomic64_read(&iocg->vtime);
  1303. /* pay off debt */
  1304. if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
  1305. u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa);
  1306. u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt);
  1307. u64 vpay = abs_cost_to_cost(abs_vpay, hwa);
  1308. lockdep_assert_held(&ioc->lock);
  1309. atomic64_add(vpay, &iocg->vtime);
  1310. atomic64_add(vpay, &iocg->done_vtime);
  1311. iocg_pay_debt(iocg, abs_vpay, now);
  1312. vbudget -= vpay;
  1313. }
  1314. if (iocg->abs_vdebt || iocg->delay)
  1315. iocg_kick_delay(iocg, now);
  1316. /*
  1317. * Debt can still be outstanding if we haven't paid all yet or the
  1318. * caller raced and called without @pay_debt. Shouldn't wake up waiters
  1319. * under debt. Make sure @vbudget reflects the outstanding amount and is
  1320. * not positive.
  1321. */
  1322. if (iocg->abs_vdebt) {
  1323. s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa);
  1324. vbudget = min_t(s64, 0, vbudget - vdebt);
  1325. }
  1326. /*
  1327. * Wake up the ones which are due and see how much vtime we'll need for
  1328. * the next one. As paying off debt restores hw_inuse, it must be read
  1329. * after the above debt payment.
  1330. */
  1331. ctx.vbudget = vbudget;
  1332. current_hweight(iocg, NULL, &ctx.hw_inuse);
  1333. __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
  1334. if (!waitqueue_active(&iocg->waitq)) {
  1335. if (iocg->wait_since) {
  1336. iocg->stat.wait_us += now->now - iocg->wait_since;
  1337. iocg->wait_since = 0;
  1338. }
  1339. return;
  1340. }
  1341. if (!iocg->wait_since)
  1342. iocg->wait_since = now->now;
  1343. if (WARN_ON_ONCE(ctx.vbudget >= 0))
  1344. return;
  1345. /* determine next wakeup, add a timer margin to guarantee chunking */
  1346. vshortage = -ctx.vbudget;
  1347. expires = now->now_ns +
  1348. DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) *
  1349. NSEC_PER_USEC;
  1350. expires += ioc->timer_slack_ns;
  1351. /* if already active and close enough, don't bother */
  1352. oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
  1353. if (hrtimer_is_queued(&iocg->waitq_timer) &&
  1354. abs(oexpires - expires) <= ioc->timer_slack_ns)
  1355. return;
  1356. hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
  1357. ioc->timer_slack_ns, HRTIMER_MODE_ABS);
  1358. }
  1359. static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
  1360. {
  1361. struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
  1362. bool pay_debt = READ_ONCE(iocg->abs_vdebt);
  1363. struct ioc_now now;
  1364. unsigned long flags;
  1365. ioc_now(iocg->ioc, &now);
  1366. iocg_lock(iocg, pay_debt, &flags);
  1367. iocg_kick_waitq(iocg, pay_debt, &now);
  1368. iocg_unlock(iocg, pay_debt, &flags);
  1369. return HRTIMER_NORESTART;
  1370. }
  1371. static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
  1372. {
  1373. u32 nr_met[2] = { };
  1374. u32 nr_missed[2] = { };
  1375. u64 rq_wait_ns = 0;
  1376. int cpu, rw;
  1377. for_each_online_cpu(cpu) {
  1378. struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
  1379. u64 this_rq_wait_ns;
  1380. for (rw = READ; rw <= WRITE; rw++) {
  1381. u32 this_met = local_read(&stat->missed[rw].nr_met);
  1382. u32 this_missed = local_read(&stat->missed[rw].nr_missed);
  1383. nr_met[rw] += this_met - stat->missed[rw].last_met;
  1384. nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
  1385. stat->missed[rw].last_met = this_met;
  1386. stat->missed[rw].last_missed = this_missed;
  1387. }
  1388. this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
  1389. rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
  1390. stat->last_rq_wait_ns = this_rq_wait_ns;
  1391. }
  1392. for (rw = READ; rw <= WRITE; rw++) {
  1393. if (nr_met[rw] + nr_missed[rw])
  1394. missed_ppm_ar[rw] =
  1395. DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
  1396. nr_met[rw] + nr_missed[rw]);
  1397. else
  1398. missed_ppm_ar[rw] = 0;
  1399. }
  1400. *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
  1401. ioc->period_us * NSEC_PER_USEC);
  1402. }
  1403. /* was iocg idle this period? */
  1404. static bool iocg_is_idle(struct ioc_gq *iocg)
  1405. {
  1406. struct ioc *ioc = iocg->ioc;
  1407. /* did something get issued this period? */
  1408. if (atomic64_read(&iocg->active_period) ==
  1409. atomic64_read(&ioc->cur_period))
  1410. return false;
  1411. /* is something in flight? */
  1412. if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
  1413. return false;
  1414. return true;
  1415. }
  1416. /*
  1417. * Call this function on the target leaf @iocg's to build pre-order traversal
  1418. * list of all the ancestors in @inner_walk. The inner nodes are linked through
  1419. * ->walk_list and the caller is responsible for dissolving the list after use.
  1420. */
  1421. static void iocg_build_inner_walk(struct ioc_gq *iocg,
  1422. struct list_head *inner_walk)
  1423. {
  1424. int lvl;
  1425. WARN_ON_ONCE(!list_empty(&iocg->walk_list));
  1426. /* find the first ancestor which hasn't been visited yet */
  1427. for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
  1428. if (!list_empty(&iocg->ancestors[lvl]->walk_list))
  1429. break;
  1430. }
  1431. /* walk down and visit the inner nodes to get pre-order traversal */
  1432. while (++lvl <= iocg->level - 1) {
  1433. struct ioc_gq *inner = iocg->ancestors[lvl];
  1434. /* record traversal order */
  1435. list_add_tail(&inner->walk_list, inner_walk);
  1436. }
  1437. }
  1438. /* propagate the deltas to the parent */
  1439. static void iocg_flush_stat_upward(struct ioc_gq *iocg)
  1440. {
  1441. if (iocg->level > 0) {
  1442. struct iocg_stat *parent_stat =
  1443. &iocg->ancestors[iocg->level - 1]->stat;
  1444. parent_stat->usage_us +=
  1445. iocg->stat.usage_us - iocg->last_stat.usage_us;
  1446. parent_stat->wait_us +=
  1447. iocg->stat.wait_us - iocg->last_stat.wait_us;
  1448. parent_stat->indebt_us +=
  1449. iocg->stat.indebt_us - iocg->last_stat.indebt_us;
  1450. parent_stat->indelay_us +=
  1451. iocg->stat.indelay_us - iocg->last_stat.indelay_us;
  1452. }
  1453. iocg->last_stat = iocg->stat;
  1454. }
  1455. /* collect per-cpu counters and propagate the deltas to the parent */
  1456. static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now)
  1457. {
  1458. struct ioc *ioc = iocg->ioc;
  1459. u64 abs_vusage = 0;
  1460. u64 vusage_delta;
  1461. int cpu;
  1462. lockdep_assert_held(&iocg->ioc->lock);
  1463. /* collect per-cpu counters */
  1464. for_each_possible_cpu(cpu) {
  1465. abs_vusage += local64_read(
  1466. per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
  1467. }
  1468. vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
  1469. iocg->last_stat_abs_vusage = abs_vusage;
  1470. iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate);
  1471. iocg->stat.usage_us += iocg->usage_delta_us;
  1472. iocg_flush_stat_upward(iocg);
  1473. }
  1474. /* get stat counters ready for reading on all active iocgs */
  1475. static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
  1476. {
  1477. LIST_HEAD(inner_walk);
  1478. struct ioc_gq *iocg, *tiocg;
  1479. /* flush leaves and build inner node walk list */
  1480. list_for_each_entry(iocg, target_iocgs, active_list) {
  1481. iocg_flush_stat_leaf(iocg, now);
  1482. iocg_build_inner_walk(iocg, &inner_walk);
  1483. }
  1484. /* keep flushing upwards by walking the inner list backwards */
  1485. list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
  1486. iocg_flush_stat_upward(iocg);
  1487. list_del_init(&iocg->walk_list);
  1488. }
  1489. }
  1490. /*
  1491. * Determine what @iocg's hweight_inuse should be after donating unused
  1492. * capacity. @hwm is the upper bound and used to signal no donation. This
  1493. * function also throws away @iocg's excess budget.
  1494. */
  1495. static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm,
  1496. u32 usage, struct ioc_now *now)
  1497. {
  1498. struct ioc *ioc = iocg->ioc;
  1499. u64 vtime = atomic64_read(&iocg->vtime);
  1500. s64 excess, delta, target, new_hwi;
  1501. /* debt handling owns inuse for debtors */
  1502. if (iocg->abs_vdebt)
  1503. return 1;
  1504. /* see whether minimum margin requirement is met */
  1505. if (waitqueue_active(&iocg->waitq) ||
  1506. time_after64(vtime, now->vnow - ioc->margins.min))
  1507. return hwm;
  1508. /* throw away excess above target */
  1509. excess = now->vnow - vtime - ioc->margins.target;
  1510. if (excess > 0) {
  1511. atomic64_add(excess, &iocg->vtime);
  1512. atomic64_add(excess, &iocg->done_vtime);
  1513. vtime += excess;
  1514. ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE);
  1515. }
  1516. /*
1517. * Let delta be the distance between the iocg's and the device's vtimes
1518. * as a fraction of the period duration. Assuming that the iocg will
  1519. * consume the usage determined above, we want to determine new_hwi so
  1520. * that delta equals MARGIN_TARGET at the end of the next period.
  1521. *
  1522. * We need to execute usage worth of IOs while spending the sum of the
  1523. * new budget (1 - MARGIN_TARGET) and the leftover from the last period
  1524. * (delta):
  1525. *
  1526. * usage = (1 - MARGIN_TARGET + delta) * new_hwi
  1527. *
  1528. * Therefore, the new_hwi is:
  1529. *
  1530. * new_hwi = usage / (1 - MARGIN_TARGET + delta)
  1531. */
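/*
 * For example, if MARGIN_TARGET is 50%, the iocg enters the period with
 * leftover budget worth a quarter of the period (delta = 0.25) and is
 * expected to consume half of the device (usage = 0.5), then
 * new_hwi = 0.5 / (1 - 0.5 + 0.25) = 2/3.
 */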
  1532. delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
  1533. now->vnow - ioc->period_at_vtime);
  1534. target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
  1535. new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
  1536. return clamp_t(s64, new_hwi, 1, hwm);
  1537. }
  1538. /*
  1539. * For work-conservation, an iocg which isn't using all of its share should
  1540. * donate the leftover to other iocgs. There are two ways to achieve this - 1.
  1541. * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight.
  1542. *
  1543. * #1 is mathematically simpler but has the drawback of requiring synchronous
1544. * global hweight_inuse updates when idle iocgs get activated or inuse weights
  1545. * change due to donation snapbacks as it has the possibility of grossly
  1546. * overshooting what's allowed by the model and vrate.
  1547. *
  1548. * #2 is inherently safe with local operations. The donating iocg can easily
  1549. * snap back to higher weights when needed without worrying about impacts on
  1550. * other nodes as the impacts will be inherently correct. This also makes idle
  1551. * iocg activations safe. The only effect activations have is decreasing
  1552. * hweight_inuse of others, the right solution to which is for those iocgs to
  1553. * snap back to higher weights.
  1554. *
  1555. * So, we go with #2. The challenge is calculating how each donating iocg's
  1556. * inuse should be adjusted to achieve the target donation amounts. This is done
  1557. * using Andy's method described in the following pdf.
  1558. *
  1559. * https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo
  1560. *
  1561. * Given the weights and target after-donation hweight_inuse values, Andy's
1562. * method determines what the proportional distribution should look like at each
  1563. * sibling level to maintain the relative relationship between all non-donating
  1564. * pairs. To roughly summarize, it divides the tree into donating and
  1565. * non-donating parts, calculates global donation rate which is used to
  1566. * determine the target hweight_inuse for each node, and then derives per-level
  1567. * proportions.
  1568. *
  1569. * The following pdf shows that global distribution calculated this way can be
  1570. * achieved by scaling inuse weights of donating leaves and propagating the
  1571. * adjustments upwards proportionally.
  1572. *
  1573. * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
  1574. *
  1575. * Combining the above two, we can determine how each leaf iocg's inuse should
  1576. * be adjusted to achieve the target donation.
  1577. *
  1578. * https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN
  1579. *
  1580. * The inline comments use symbols from the last pdf.
  1581. *
  1582. * b is the sum of the absolute budgets in the subtree. 1 for the root node.
  1583. * f is the sum of the absolute budgets of non-donating nodes in the subtree.
  1584. * t is the sum of the absolute budgets of donating nodes in the subtree.
  1585. * w is the weight of the node. w = w_f + w_t
  1586. * w_f is the non-donating portion of w. w_f = w * f / b
1587. * w_t is the donating portion of w. w_t = w * t / b
  1588. * s is the sum of all sibling weights. s = Sum(w) for siblings
  1589. * s_f and s_t are the non-donating and donating portions of s.
  1590. *
  1591. * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
  1592. * w_pt is the donating portion of the parent's weight and w'_pt the same value
  1593. * after adjustments. Subscript r denotes the root node's values.
  1594. */
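/*
 * For example, a node with w = 100 whose subtree budget is 3/4
 * non-donating splits as w_f = 75 and w_t = 25; in a sibling group with
 * s = 400, s_f and s_t are the corresponding sums across all siblings.
 */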
  1595. static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
  1596. {
  1597. LIST_HEAD(over_hwa);
  1598. LIST_HEAD(inner_walk);
  1599. struct ioc_gq *iocg, *tiocg, *root_iocg;
  1600. u32 after_sum, over_sum, over_target, gamma;
  1601. /*
  1602. * It's pretty unlikely but possible for the total sum of
  1603. * hweight_after_donation's to be higher than WEIGHT_ONE, which will
1604. * confuse the following calculations. If such a condition is detected,
  1605. * scale down everyone over its full share equally to keep the sum below
  1606. * WEIGHT_ONE.
  1607. */
  1608. after_sum = 0;
  1609. over_sum = 0;
  1610. list_for_each_entry(iocg, surpluses, surplus_list) {
  1611. u32 hwa;
  1612. current_hweight(iocg, &hwa, NULL);
  1613. after_sum += iocg->hweight_after_donation;
  1614. if (iocg->hweight_after_donation > hwa) {
  1615. over_sum += iocg->hweight_after_donation;
  1616. list_add(&iocg->walk_list, &over_hwa);
  1617. }
  1618. }
  1619. if (after_sum >= WEIGHT_ONE) {
  1620. /*
  1621. * The delta should be deducted from the over_sum, calculate
  1622. * target over_sum value.
  1623. */
  1624. u32 over_delta = after_sum - (WEIGHT_ONE - 1);
  1625. WARN_ON_ONCE(over_sum <= over_delta);
  1626. over_target = over_sum - over_delta;
  1627. } else {
  1628. over_target = 0;
  1629. }
  1630. list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) {
  1631. if (over_target)
  1632. iocg->hweight_after_donation =
  1633. div_u64((u64)iocg->hweight_after_donation *
  1634. over_target, over_sum);
  1635. list_del_init(&iocg->walk_list);
  1636. }
  1637. /*
  1638. * Build pre-order inner node walk list and prepare for donation
  1639. * adjustment calculations.
  1640. */
  1641. list_for_each_entry(iocg, surpluses, surplus_list) {
  1642. iocg_build_inner_walk(iocg, &inner_walk);
  1643. }
  1644. root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list);
  1645. WARN_ON_ONCE(root_iocg->level > 0);
  1646. list_for_each_entry(iocg, &inner_walk, walk_list) {
  1647. iocg->child_adjusted_sum = 0;
  1648. iocg->hweight_donating = 0;
  1649. iocg->hweight_after_donation = 0;
  1650. }
  1651. /*
  1652. * Propagate the donating budget (b_t) and after donation budget (b'_t)
  1653. * up the hierarchy.
  1654. */
  1655. list_for_each_entry(iocg, surpluses, surplus_list) {
  1656. struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
  1657. parent->hweight_donating += iocg->hweight_donating;
  1658. parent->hweight_after_donation += iocg->hweight_after_donation;
  1659. }
  1660. list_for_each_entry_reverse(iocg, &inner_walk, walk_list) {
  1661. if (iocg->level > 0) {
  1662. struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
  1663. parent->hweight_donating += iocg->hweight_donating;
  1664. parent->hweight_after_donation += iocg->hweight_after_donation;
  1665. }
  1666. }
  1667. /*
  1668. * Calculate inner hwa's (b) and make sure the donation values are
  1669. * within the accepted ranges as we're doing low res calculations with
  1670. * roundups.
  1671. */
  1672. list_for_each_entry(iocg, &inner_walk, walk_list) {
  1673. if (iocg->level) {
  1674. struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
  1675. iocg->hweight_active = DIV64_U64_ROUND_UP(
  1676. (u64)parent->hweight_active * iocg->active,
  1677. parent->child_active_sum);
  1678. }
  1679. iocg->hweight_donating = min(iocg->hweight_donating,
  1680. iocg->hweight_active);
  1681. iocg->hweight_after_donation = min(iocg->hweight_after_donation,
  1682. iocg->hweight_donating - 1);
  1683. if (WARN_ON_ONCE(iocg->hweight_active <= 1 ||
  1684. iocg->hweight_donating <= 1 ||
  1685. iocg->hweight_after_donation == 0)) {
  1686. pr_warn("iocg: invalid donation weights in ");
  1687. pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup);
  1688. pr_cont(": active=%u donating=%u after=%u\n",
  1689. iocg->hweight_active, iocg->hweight_donating,
  1690. iocg->hweight_after_donation);
  1691. }
  1692. }
  1693. /*
  1694. * Calculate the global donation rate (gamma) - the rate to adjust
  1695. * non-donating budgets by.
  1696. *
  1697. * No need to use 64bit multiplication here as the first operand is
  1698. * guaranteed to be smaller than WEIGHT_ONE (1<<16).
  1699. *
  1700. * We know that there are beneficiary nodes and the sum of the donating
  1701. * hweights can't be whole; however, due to the round-ups during hweight
  1702. * calculations, root_iocg->hweight_donating might still end up equal to
  1703. * or greater than whole. Limit the range when calculating the divider.
  1704. *
  1705. * gamma = (1 - t_r') / (1 - t_r)
  1706. */
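/*
 * e.g. if donating nodes currently hold 40% of the device (t_r = 0.4)
 * and will hold 25% after donation (t_r' = 0.25), then
 * gamma = 0.75 / 0.6 = 1.25 and every non-donating budget grows by 25%.
 */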
  1707. gamma = DIV_ROUND_UP(
  1708. (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE,
  1709. WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1));
  1710. /*
  1711. * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner
  1712. * nodes.
  1713. */
  1714. list_for_each_entry(iocg, &inner_walk, walk_list) {
  1715. struct ioc_gq *parent;
  1716. u32 inuse, wpt, wptp;
  1717. u64 st, sf;
  1718. if (iocg->level == 0) {
  1719. /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */
  1720. iocg->child_adjusted_sum = DIV64_U64_ROUND_UP(
  1721. iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating),
  1722. WEIGHT_ONE - iocg->hweight_after_donation);
  1723. continue;
  1724. }
  1725. parent = iocg->ancestors[iocg->level - 1];
  1726. /* b' = gamma * b_f + b_t' */
  1727. iocg->hweight_inuse = DIV64_U64_ROUND_UP(
  1728. (u64)gamma * (iocg->hweight_active - iocg->hweight_donating),
  1729. WEIGHT_ONE) + iocg->hweight_after_donation;
  1730. /* w' = s' * b' / b'_p */
  1731. inuse = DIV64_U64_ROUND_UP(
  1732. (u64)parent->child_adjusted_sum * iocg->hweight_inuse,
  1733. parent->hweight_inuse);
  1734. /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */
  1735. st = DIV64_U64_ROUND_UP(
  1736. iocg->child_active_sum * iocg->hweight_donating,
  1737. iocg->hweight_active);
  1738. sf = iocg->child_active_sum - st;
  1739. wpt = DIV64_U64_ROUND_UP(
  1740. (u64)iocg->active * iocg->hweight_donating,
  1741. iocg->hweight_active);
  1742. wptp = DIV64_U64_ROUND_UP(
  1743. (u64)inuse * iocg->hweight_after_donation,
  1744. iocg->hweight_inuse);
  1745. iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt);
  1746. }
  1747. /*
  1748. * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and
  1749. * we can finally determine leaf adjustments.
  1750. */
  1751. list_for_each_entry(iocg, surpluses, surplus_list) {
  1752. struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
  1753. u32 inuse;
  1754. /*
  1755. * In-debt iocgs participated in the donation calculation with
  1756. * the minimum target hweight_inuse. Configuring inuse
  1757. * accordingly would work fine but debt handling expects
1758. * @iocg->inuse to stay at the minimum and we don't wanna
  1759. * interfere.
  1760. */
  1761. if (iocg->abs_vdebt) {
  1762. WARN_ON_ONCE(iocg->inuse > 1);
  1763. continue;
  1764. }
  1765. /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */
  1766. inuse = DIV64_U64_ROUND_UP(
  1767. parent->child_adjusted_sum * iocg->hweight_after_donation,
  1768. parent->hweight_inuse);
  1769. TRACE_IOCG_PATH(inuse_transfer, iocg, now,
  1770. iocg->inuse, inuse,
  1771. iocg->hweight_inuse,
  1772. iocg->hweight_after_donation);
  1773. __propagate_weights(iocg, iocg->active, inuse, true, now);
  1774. }
  1775. /* walk list should be dissolved after use */
  1776. list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list)
  1777. list_del_init(&iocg->walk_list);
  1778. }
  1779. /*
  1780. * A low weight iocg can amass a large amount of debt, for example, when
  1781. * anonymous memory gets reclaimed aggressively. If the system has a lot of
  1782. * memory paired with a slow IO device, the debt can span multiple seconds or
  1783. * more. If there are no other subsequent IO issuers, the in-debt iocg may end
  1784. * up blocked paying its debt while the IO device is idle.
  1785. *
  1786. * The following protects against such cases. If the device has been
  1787. * sufficiently idle for a while, the debts are halved and delays are
  1788. * recalculated.
  1789. */
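/*
 * e.g. if the device stays sufficiently idle across two forgiveness
 * periods, each debtor's abs_vdebt and delay are shifted right by two,
 * i.e. quartered (floored at 1).
 */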
  1790. static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
  1791. struct ioc_now *now)
  1792. {
  1793. struct ioc_gq *iocg;
  1794. u64 dur, usage_pct, nr_cycles, nr_cycles_shift;
  1795. /* if no debtor, reset the cycle */
  1796. if (!nr_debtors) {
  1797. ioc->dfgv_period_at = now->now;
  1798. ioc->dfgv_period_rem = 0;
  1799. ioc->dfgv_usage_us_sum = 0;
  1800. return;
  1801. }
  1802. /*
  1803. * Debtors can pass through a lot of writes choking the device and we
  1804. * don't want to be forgiving debts while the device is struggling from
  1805. * write bursts. If we're missing latency targets, consider the device
  1806. * fully utilized.
  1807. */
  1808. if (ioc->busy_level > 0)
  1809. usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us);
  1810. ioc->dfgv_usage_us_sum += usage_us_sum;
  1811. if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD))
  1812. return;
  1813. /*
  1814. * At least DFGV_PERIOD has passed since the last period. Calculate the
  1815. * average usage and reset the period counters.
  1816. */
  1817. dur = now->now - ioc->dfgv_period_at;
  1818. usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur);
  1819. ioc->dfgv_period_at = now->now;
  1820. ioc->dfgv_usage_us_sum = 0;
1821. /* if the device was too busy, reset everything */
  1822. if (usage_pct > DFGV_USAGE_PCT) {
  1823. ioc->dfgv_period_rem = 0;
  1824. return;
  1825. }
  1826. /*
  1827. * Usage is lower than threshold. Let's forgive some debts. Debt
  1828. * forgiveness runs off of the usual ioc timer but its period usually
  1829. * doesn't match ioc's. Compensate the difference by performing the
  1830. * reduction as many times as would fit in the duration since the last
  1831. * run and carrying over the left-over duration in @ioc->dfgv_period_rem
  1832. * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive
  1833. * reductions is doubled.
  1834. */
  1835. nr_cycles = dur + ioc->dfgv_period_rem;
  1836. ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD);
  1837. list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
  1838. u64 __maybe_unused old_debt, __maybe_unused old_delay;
  1839. if (!iocg->abs_vdebt && !iocg->delay)
  1840. continue;
  1841. spin_lock(&iocg->waitq.lock);
  1842. old_debt = iocg->abs_vdebt;
  1843. old_delay = iocg->delay;
  1844. nr_cycles_shift = min_t(u64, nr_cycles, BITS_PER_LONG - 1);
  1845. if (iocg->abs_vdebt)
  1846. iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles_shift ?: 1;
  1847. if (iocg->delay)
  1848. iocg->delay = iocg->delay >> nr_cycles_shift ?: 1;
  1849. iocg_kick_waitq(iocg, true, now);
  1850. TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct,
  1851. old_debt, iocg->abs_vdebt,
  1852. old_delay, iocg->delay);
  1853. spin_unlock(&iocg->waitq.lock);
  1854. }
  1855. }
  1856. /*
1857. * Check the active iocgs' state to avoid oversleeping and deactivate
  1858. * idle iocgs.
  1859. *
  1860. * Since waiters determine the sleep durations based on the vrate
  1861. * they saw at the time of sleep, if vrate has increased, some
  1862. * waiters could be sleeping for too long. Wake up tardy waiters
  1863. * which should have woken up in the last period and expire idle
  1864. * iocgs.
  1865. */
  1866. static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now)
  1867. {
  1868. int nr_debtors = 0;
  1869. struct ioc_gq *iocg, *tiocg;
  1870. list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
  1871. if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
  1872. !iocg->delay && !iocg_is_idle(iocg))
  1873. continue;
  1874. spin_lock(&iocg->waitq.lock);
  1875. /* flush wait and indebt stat deltas */
  1876. if (iocg->wait_since) {
  1877. iocg->stat.wait_us += now->now - iocg->wait_since;
  1878. iocg->wait_since = now->now;
  1879. }
  1880. if (iocg->indebt_since) {
  1881. iocg->stat.indebt_us +=
  1882. now->now - iocg->indebt_since;
  1883. iocg->indebt_since = now->now;
  1884. }
  1885. if (iocg->indelay_since) {
  1886. iocg->stat.indelay_us +=
  1887. now->now - iocg->indelay_since;
  1888. iocg->indelay_since = now->now;
  1889. }
  1890. if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
  1891. iocg->delay) {
  1892. /* might be oversleeping vtime / hweight changes, kick */
  1893. iocg_kick_waitq(iocg, true, now);
  1894. if (iocg->abs_vdebt || iocg->delay)
  1895. nr_debtors++;
  1896. } else if (iocg_is_idle(iocg)) {
  1897. /* no waiter and idle, deactivate */
  1898. u64 vtime = atomic64_read(&iocg->vtime);
  1899. s64 excess;
  1900. /*
  1901. * @iocg has been inactive for a full duration and will
1902. * have a high budget. Account anything above target as an
1903. * error and throw it away. On reactivation, it'll start
  1904. * with the target budget.
  1905. */
  1906. excess = now->vnow - vtime - ioc->margins.target;
  1907. if (excess > 0) {
  1908. u32 old_hwi;
  1909. current_hweight(iocg, NULL, &old_hwi);
  1910. ioc->vtime_err -= div64_u64(excess * old_hwi,
  1911. WEIGHT_ONE);
  1912. }
  1913. TRACE_IOCG_PATH(iocg_idle, iocg, now,
  1914. atomic64_read(&iocg->active_period),
  1915. atomic64_read(&ioc->cur_period), vtime);
  1916. __propagate_weights(iocg, 0, 0, false, now);
  1917. list_del_init(&iocg->active_list);
  1918. }
  1919. spin_unlock(&iocg->waitq.lock);
  1920. }
  1921. commit_weights(ioc);
  1922. return nr_debtors;
  1923. }
  1924. static void ioc_timer_fn(struct timer_list *timer)
  1925. {
  1926. struct ioc *ioc = container_of(timer, struct ioc, timer);
  1927. struct ioc_gq *iocg, *tiocg;
  1928. struct ioc_now now;
  1929. LIST_HEAD(surpluses);
  1930. int nr_debtors, nr_shortages = 0, nr_lagging = 0;
  1931. u64 usage_us_sum = 0;
  1932. u32 ppm_rthr;
  1933. u32 ppm_wthr;
  1934. u32 missed_ppm[2], rq_wait_pct;
  1935. u64 period_vtime;
  1936. int prev_busy_level;
  1937. /* how were the latencies during the period? */
  1938. ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
  1939. /* take care of active iocgs */
  1940. spin_lock_irq(&ioc->lock);
  1941. ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
  1942. ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
  1943. ioc_now(ioc, &now);
  1944. period_vtime = now.vnow - ioc->period_at_vtime;
  1945. if (WARN_ON_ONCE(!period_vtime)) {
  1946. spin_unlock_irq(&ioc->lock);
  1947. return;
  1948. }
  1949. nr_debtors = ioc_check_iocgs(ioc, &now);
  1950. /*
  1951. * Wait and indebt stat are flushed above and the donation calculation
  1952. * below needs updated usage stat. Let's bring stat up-to-date.
  1953. */
  1954. iocg_flush_stat(&ioc->active_iocgs, &now);
  1955. /* calc usage and see whether some weights need to be moved around */
  1956. list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
  1957. u64 vdone, vtime, usage_us;
  1958. u32 hw_active, hw_inuse;
  1959. /*
  1960. * Collect unused and wind vtime closer to vnow to prevent
  1961. * iocgs from accumulating a large amount of budget.
  1962. */
  1963. vdone = atomic64_read(&iocg->done_vtime);
  1964. vtime = atomic64_read(&iocg->vtime);
  1965. current_hweight(iocg, &hw_active, &hw_inuse);
  1966. /*
  1967. * Latency QoS detection doesn't account for IOs which are
  1968. * in-flight for longer than a period. Detect them by
  1969. * comparing vdone against period start. If lagging behind
  1970. * IOs from past periods, don't increase vrate.
  1971. */
  1972. if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
  1973. !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
  1974. time_after64(vtime, vdone) &&
  1975. time_after64(vtime, now.vnow -
  1976. MAX_LAGGING_PERIODS * period_vtime) &&
  1977. time_before64(vdone, now.vnow - period_vtime))
  1978. nr_lagging++;
  1979. /*
  1980. * Determine absolute usage factoring in in-flight IOs to avoid
  1981. * high-latency completions appearing as idle.
  1982. */
  1983. usage_us = iocg->usage_delta_us;
  1984. usage_us_sum += usage_us;
  1985. /* see whether there's surplus vtime */
  1986. WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
  1987. if (hw_inuse < hw_active ||
  1988. (!waitqueue_active(&iocg->waitq) &&
  1989. time_before64(vtime, now.vnow - ioc->margins.low))) {
  1990. u32 hwa, old_hwi, hwm, new_hwi, usage;
  1991. u64 usage_dur;
  1992. if (vdone != vtime) {
  1993. u64 inflight_us = DIV64_U64_ROUND_UP(
  1994. cost_to_abs_cost(vtime - vdone, hw_inuse),
  1995. ioc->vtime_base_rate);
  1996. usage_us = max(usage_us, inflight_us);
  1997. }
  1998. /* convert to hweight based usage ratio */
  1999. if (time_after64(iocg->activated_at, ioc->period_at))
  2000. usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
  2001. else
  2002. usage_dur = max_t(u64, now.now - ioc->period_at, 1);
  2003. usage = clamp_t(u32,
  2004. DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
  2005. usage_dur),
  2006. 1, WEIGHT_ONE);
  2007. /*
  2008. * Already donating or accumulated enough to start.
  2009. * Determine the donation amount.
  2010. */
  2011. current_hweight(iocg, &hwa, &old_hwi);
  2012. hwm = current_hweight_max(iocg);
  2013. new_hwi = hweight_after_donation(iocg, old_hwi, hwm,
  2014. usage, &now);
  2015. /*
  2016. * Donation calculation assumes hweight_after_donation
  2017. * to be positive, a condition that a donor w/ hwa < 2
  2018. * can't meet. Don't bother with donation if hwa is
  2019. * below 2. It's not gonna make a meaningful difference
  2020. * anyway.
  2021. */
  2022. if (new_hwi < hwm && hwa >= 2) {
  2023. iocg->hweight_donating = hwa;
  2024. iocg->hweight_after_donation = new_hwi;
  2025. list_add(&iocg->surplus_list, &surpluses);
  2026. } else if (!iocg->abs_vdebt) {
  2027. /*
  2028. * @iocg doesn't have enough to donate. Reset
  2029. * its inuse to active.
  2030. *
  2031. * Don't reset debtors as their inuse's are
  2032. * owned by debt handling. This shouldn't affect
2033. * donation calculation in any meaningful way
  2034. * as @iocg doesn't have a meaningful amount of
  2035. * share anyway.
  2036. */
  2037. TRACE_IOCG_PATH(inuse_shortage, iocg, &now,
  2038. iocg->inuse, iocg->active,
  2039. iocg->hweight_inuse, new_hwi);
  2040. __propagate_weights(iocg, iocg->active,
  2041. iocg->active, true, &now);
  2042. nr_shortages++;
  2043. }
  2044. } else {
  2045. /* genuinely short on vtime */
  2046. nr_shortages++;
  2047. }
  2048. }
  2049. if (!list_empty(&surpluses) && nr_shortages)
  2050. transfer_surpluses(&surpluses, &now);
  2051. commit_weights(ioc);
  2052. /* surplus list should be dissolved after use */
  2053. list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list)
  2054. list_del_init(&iocg->surplus_list);
  2055. /*
  2056. * If q is getting clogged or we're missing too much, we're issuing
  2057. * too much IO and should lower vtime rate. If we're not missing
  2058. * and experiencing shortages but not surpluses, we're too stingy
  2059. * and should increase vtime rate.
  2060. */
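/*
 * e.g. consecutive periods which clearly miss the QoS targets keep
 * incrementing busy_level (slowing vrate), while well-met periods with
 * shortages and no lagging IOs walk it back down (raising vrate).
 */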
  2061. prev_busy_level = ioc->busy_level;
  2062. if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
  2063. missed_ppm[READ] > ppm_rthr ||
  2064. missed_ppm[WRITE] > ppm_wthr) {
  2065. /* clearly missing QoS targets, slow down vrate */
  2066. ioc->busy_level = max(ioc->busy_level, 0);
  2067. ioc->busy_level++;
  2068. } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
  2069. missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
  2070. missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
  2071. /* QoS targets are being met with >25% margin */
  2072. if (nr_shortages) {
  2073. /*
  2074. * We're throttling while the device has spare
  2075. * capacity. If vrate was being slowed down, stop.
  2076. */
  2077. ioc->busy_level = min(ioc->busy_level, 0);
  2078. /*
  2079. * If there are IOs spanning multiple periods, wait
  2080. * them out before pushing the device harder.
  2081. */
  2082. if (!nr_lagging)
  2083. ioc->busy_level--;
  2084. } else {
  2085. /*
  2086. * Nobody is being throttled and the users aren't
  2087. * issuing enough IOs to saturate the device. We
  2088. * simply don't know how close the device is to
  2089. * saturation. Coast.
  2090. */
  2091. ioc->busy_level = 0;
  2092. }
  2093. } else {
2094. /* inside the hysteresis margin, we're good */
  2095. ioc->busy_level = 0;
  2096. }
  2097. ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
  2098. ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages,
  2099. prev_busy_level, missed_ppm);
  2100. ioc_refresh_params(ioc, false);
  2101. ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now);
  2102. /*
  2103. * This period is done. Move onto the next one. If nothing's
  2104. * going on with the device, stop the timer.
  2105. */
  2106. atomic64_inc(&ioc->cur_period);
  2107. if (ioc->running != IOC_STOP) {
  2108. if (!list_empty(&ioc->active_iocgs)) {
  2109. ioc_start_period(ioc, &now);
  2110. } else {
  2111. ioc->busy_level = 0;
  2112. ioc->vtime_err = 0;
  2113. ioc->running = IOC_IDLE;
  2114. }
  2115. ioc_refresh_vrate(ioc, &now);
  2116. }
  2117. spin_unlock_irq(&ioc->lock);
  2118. }
  2119. static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
  2120. u64 abs_cost, struct ioc_now *now)
  2121. {
  2122. struct ioc *ioc = iocg->ioc;
  2123. struct ioc_margins *margins = &ioc->margins;
  2124. u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi;
  2125. u32 hwi, adj_step;
  2126. s64 margin;
  2127. u64 cost, new_inuse;
  2128. unsigned long flags;
  2129. current_hweight(iocg, NULL, &hwi);
  2130. old_hwi = hwi;
  2131. cost = abs_cost_to_cost(abs_cost, hwi);
  2132. margin = now->vnow - vtime - cost;
  2133. /* debt handling owns inuse for debtors */
  2134. if (iocg->abs_vdebt)
  2135. return cost;
  2136. /*
2137. * We only increase inuse during the period and do so if the margin has
  2138. * deteriorated since the previous adjustment.
  2139. */
  2140. if (margin >= iocg->saved_margin || margin >= margins->low ||
  2141. iocg->inuse == iocg->active)
  2142. return cost;
  2143. spin_lock_irqsave(&ioc->lock, flags);
  2144. /* we own inuse only when @iocg is in the normal active state */
  2145. if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
  2146. spin_unlock_irqrestore(&ioc->lock, flags);
  2147. return cost;
  2148. }
  2149. /*
  2150. * Bump up inuse till @abs_cost fits in the existing budget.
  2151. * adj_step must be determined after acquiring ioc->lock - we might
  2152. * have raced and lost to another thread for activation and could
2153. * be reading iocg->active as 0 before taking ioc->lock, which would
2154. * lead to an infinite loop.
  2155. */
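/*
 * e.g. if INUSE_ADJ_STEP_PCT is 25, adj_step is a quarter of the active
 * weight and at most four passes take inuse from its minimum all the way
 * back up to active.
 */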
  2156. new_inuse = iocg->inuse;
  2157. adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100);
  2158. do {
  2159. new_inuse = new_inuse + adj_step;
  2160. propagate_weights(iocg, iocg->active, new_inuse, true, now);
  2161. current_hweight(iocg, NULL, &hwi);
  2162. cost = abs_cost_to_cost(abs_cost, hwi);
  2163. } while (time_after64(vtime + cost, now->vnow) &&
  2164. iocg->inuse != iocg->active);
  2165. spin_unlock_irqrestore(&ioc->lock, flags);
  2166. TRACE_IOCG_PATH(inuse_adjust, iocg, now,
  2167. old_inuse, iocg->inuse, old_hwi, hwi);
  2168. return cost;
  2169. }
  2170. static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
  2171. bool is_merge, u64 *costp)
  2172. {
  2173. struct ioc *ioc = iocg->ioc;
  2174. u64 coef_seqio, coef_randio, coef_page;
  2175. u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
  2176. u64 seek_pages = 0;
  2177. u64 cost = 0;
  2178. /* Can't calculate cost for empty bio */
  2179. if (!bio->bi_iter.bi_size)
  2180. goto out;
  2181. switch (bio_op(bio)) {
  2182. case REQ_OP_READ:
  2183. coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
  2184. coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
  2185. coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
  2186. break;
  2187. case REQ_OP_WRITE:
  2188. coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
  2189. coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
  2190. coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
  2191. break;
  2192. default:
  2193. goto out;
  2194. }
  2195. if (iocg->cursor) {
  2196. seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
  2197. seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
  2198. }
  2199. if (!is_merge) {
  2200. if (seek_pages > LCOEF_RANDIO_PAGES) {
  2201. cost += coef_randio;
  2202. } else {
  2203. cost += coef_seqio;
  2204. }
  2205. }
  2206. cost += pages * coef_page;
  2207. out:
  2208. *costp = cost;
  2209. }
  2210. static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
  2211. {
  2212. u64 cost;
  2213. calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
  2214. return cost;
  2215. }
  2216. static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
  2217. u64 *costp)
  2218. {
  2219. unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
  2220. switch (req_op(rq)) {
  2221. case REQ_OP_READ:
  2222. *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
  2223. break;
  2224. case REQ_OP_WRITE:
  2225. *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
  2226. break;
  2227. default:
  2228. *costp = 0;
  2229. }
  2230. }
  2231. static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
  2232. {
  2233. u64 cost;
  2234. calc_size_vtime_cost_builtin(rq, ioc, &cost);
  2235. return cost;
  2236. }
  2237. static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
  2238. {
  2239. struct blkcg_gq *blkg = bio->bi_blkg;
  2240. struct ioc *ioc = rqos_to_ioc(rqos);
  2241. struct ioc_gq *iocg = blkg_to_iocg(blkg);
  2242. struct ioc_now now;
  2243. struct iocg_wait wait;
  2244. u64 abs_cost, cost, vtime;
  2245. bool use_debt, ioc_locked;
  2246. unsigned long flags;
  2247. /* bypass IOs if disabled, still initializing, or for root cgroup */
  2248. if (!ioc->enabled || !iocg || !iocg->level)
  2249. return;
  2250. /* calculate the absolute vtime cost */
  2251. abs_cost = calc_vtime_cost(bio, iocg, false);
  2252. if (!abs_cost)
  2253. return;
  2254. if (!iocg_activate(iocg, &now))
  2255. return;
  2256. iocg->cursor = bio_end_sector(bio);
  2257. vtime = atomic64_read(&iocg->vtime);
  2258. cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
  2259. /*
  2260. * If no one's waiting and within budget, issue right away. The
  2261. * tests are racy but the races aren't systemic - we only miss once
  2262. * in a while which is fine.
  2263. */
  2264. if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
  2265. time_before_eq64(vtime + cost, now.vnow)) {
  2266. iocg_commit_bio(iocg, bio, abs_cost, cost);
  2267. return;
  2268. }
  2269. /*
  2270. * We're over budget. This can be handled in two ways. IOs which may
  2271. * cause priority inversions are punted to @ioc->aux_iocg and charged as
  2272. * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling
  2273. * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine
  2274. * whether debt handling is needed and acquire locks accordingly.
  2275. */
  2276. use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
  2277. ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
  2278. retry_lock:
  2279. iocg_lock(iocg, ioc_locked, &flags);
  2280. /*
  2281. * @iocg must stay activated for debt and waitq handling. Deactivation
  2282. * is synchronized against both ioc->lock and waitq.lock and we won't
2283. * get deactivated as long as we're waiting or have debt, so we're good
  2284. * if we're activated here. In the unlikely cases that we aren't, just
  2285. * issue the IO.
  2286. */
  2287. if (unlikely(list_empty(&iocg->active_list))) {
  2288. iocg_unlock(iocg, ioc_locked, &flags);
  2289. iocg_commit_bio(iocg, bio, abs_cost, cost);
  2290. return;
  2291. }
  2292. /*
  2293. * We're over budget. If @bio has to be issued regardless, remember
  2294. * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
  2295. * off the debt before waking more IOs.
  2296. *
  2297. * This way, the debt is continuously paid off each period with the
  2298. * actual budget available to the cgroup. If we just wound vtime, we
  2299. * would incorrectly use the current hw_inuse for the entire amount
  2300. * which, for example, can lead to the cgroup staying blocked for a
  2301. * long time even with substantially raised hw_inuse.
  2302. *
  2303. * An iocg with vdebt should stay online so that the timer can keep
2304. * deducting its vdebt and [de]activating the use_delay mechanism
  2305. * accordingly. We don't want to race against the timer trying to
  2306. * clear them and leave @iocg inactive w/ dangling use_delay heavily
  2307. * penalizing the cgroup and its descendants.
  2308. */
  2309. if (use_debt) {
  2310. iocg_incur_debt(iocg, abs_cost, &now);
  2311. if (iocg_kick_delay(iocg, &now))
  2312. blkcg_schedule_throttle(rqos->disk,
  2313. (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
  2314. iocg_unlock(iocg, ioc_locked, &flags);
  2315. return;
  2316. }
  2317. /* guarantee that iocgs w/ waiters have maximum inuse */
  2318. if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
  2319. if (!ioc_locked) {
  2320. iocg_unlock(iocg, false, &flags);
  2321. ioc_locked = true;
  2322. goto retry_lock;
  2323. }
  2324. propagate_weights(iocg, iocg->active, iocg->active, true,
  2325. &now);
  2326. }
  2327. /*
  2328. * Append self to the waitq and schedule the wakeup timer if we're
  2329. * the first waiter. The timer duration is calculated based on the
  2330. * current vrate. vtime and hweight changes can make it too short
  2331. * or too long. Each wait entry records the absolute cost it's
  2332. * waiting for to allow re-evaluation using a custom wait entry.
  2333. *
  2334. * If too short, the timer simply reschedules itself. If too long,
  2335. * the period timer will notice and trigger wakeups.
  2336. *
  2337. * All waiters are on iocg->waitq and the wait states are
  2338. * synchronized using waitq.lock.
  2339. */
  2340. init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
  2341. wait.wait.private = current;
  2342. wait.bio = bio;
  2343. wait.abs_cost = abs_cost;
  2344. wait.committed = false; /* will be set true by waker */
  2345. __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
  2346. iocg_kick_waitq(iocg, ioc_locked, &now);
  2347. iocg_unlock(iocg, ioc_locked, &flags);
  2348. while (true) {
  2349. set_current_state(TASK_UNINTERRUPTIBLE);
  2350. if (wait.committed)
  2351. break;
  2352. io_schedule();
  2353. }
  2354. /* waker already committed us, proceed */
  2355. finish_wait(&iocg->waitq, &wait.wait);
  2356. }
  2357. static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
  2358. struct bio *bio)
  2359. {
  2360. struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
  2361. struct ioc *ioc = rqos_to_ioc(rqos);
  2362. sector_t bio_end = bio_end_sector(bio);
  2363. struct ioc_now now;
  2364. u64 vtime, abs_cost, cost;
  2365. unsigned long flags;
  2366. /* bypass if disabled, still initializing, or for root cgroup */
  2367. if (!ioc->enabled || !iocg || !iocg->level)
  2368. return;
  2369. abs_cost = calc_vtime_cost(bio, iocg, true);
  2370. if (!abs_cost)
  2371. return;
  2372. ioc_now(ioc, &now);
  2373. vtime = atomic64_read(&iocg->vtime);
  2374. cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
  2375. /* update cursor if backmerging into the request at the cursor */
  2376. if (blk_rq_pos(rq) < bio_end &&
  2377. blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
  2378. iocg->cursor = bio_end;
  2379. /*
  2380. * Charge if there's enough vtime budget and the existing request has
  2381. * cost assigned.
  2382. */
  2383. if (rq->bio && rq->bio->bi_iocost_cost &&
  2384. time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
  2385. iocg_commit_bio(iocg, bio, abs_cost, cost);
  2386. return;
  2387. }
  2388. /*
  2389. * Otherwise, account it as debt if @iocg is online, which it should
  2390. * be for the vast majority of cases. See debt handling in
  2391. * ioc_rqos_throttle() for details.
  2392. */
  2393. spin_lock_irqsave(&ioc->lock, flags);
  2394. spin_lock(&iocg->waitq.lock);
  2395. if (likely(!list_empty(&iocg->active_list))) {
  2396. iocg_incur_debt(iocg, abs_cost, &now);
  2397. if (iocg_kick_delay(iocg, &now))
  2398. blkcg_schedule_throttle(rqos->disk,
  2399. (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
  2400. } else {
  2401. iocg_commit_bio(iocg, bio, abs_cost, cost);
  2402. }
  2403. spin_unlock(&iocg->waitq.lock);
  2404. spin_unlock_irqrestore(&ioc->lock, flags);
  2405. }
  2406. static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
  2407. {
  2408. struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
  2409. if (iocg && bio->bi_iocost_cost)
  2410. atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
  2411. }
  2412. static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
  2413. {
  2414. struct ioc *ioc = rqos_to_ioc(rqos);
  2415. struct ioc_pcpu_stat *ccs;
  2416. u64 on_q_ns, rq_wait_ns, size_nsec;
  2417. int pidx, rw;
  2418. if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
  2419. return;
  2420. switch (req_op(rq)) {
  2421. case REQ_OP_READ:
  2422. pidx = QOS_RLAT;
  2423. rw = READ;
  2424. break;
  2425. case REQ_OP_WRITE:
  2426. pidx = QOS_WLAT;
  2427. rw = WRITE;
  2428. break;
  2429. default:
  2430. return;
  2431. }
  2432. on_q_ns = blk_time_get_ns() - rq->alloc_time_ns;
  2433. rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
  2434. size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
  2435. ccs = get_cpu_ptr(ioc->pcpu_stat);
  2436. if (on_q_ns <= size_nsec ||
  2437. on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
  2438. local_inc(&ccs->missed[rw].nr_met);
  2439. else
  2440. local_inc(&ccs->missed[rw].nr_missed);
  2441. local64_add(rq_wait_ns, &ccs->rq_wait_ns);
  2442. put_cpu_ptr(ccs);
  2443. }
  2444. static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
  2445. {
  2446. struct ioc *ioc = rqos_to_ioc(rqos);
  2447. spin_lock_irq(&ioc->lock);
  2448. ioc_refresh_params(ioc, false);
  2449. spin_unlock_irq(&ioc->lock);
  2450. }
  2451. static void ioc_rqos_exit(struct rq_qos *rqos)
  2452. {
  2453. struct ioc *ioc = rqos_to_ioc(rqos);
  2454. blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iocost);
  2455. spin_lock_irq(&ioc->lock);
  2456. ioc->running = IOC_STOP;
  2457. spin_unlock_irq(&ioc->lock);
  2458. timer_shutdown_sync(&ioc->timer);
  2459. free_percpu(ioc->pcpu_stat);
  2460. kfree(ioc);
  2461. }
  2462. static const struct rq_qos_ops ioc_rqos_ops = {
  2463. .throttle = ioc_rqos_throttle,
  2464. .merge = ioc_rqos_merge,
  2465. .done_bio = ioc_rqos_done_bio,
  2466. .done = ioc_rqos_done,
  2467. .queue_depth_changed = ioc_rqos_queue_depth_changed,
  2468. .exit = ioc_rqos_exit,
  2469. };
  2470. static int blk_iocost_init(struct gendisk *disk)
  2471. {
  2472. struct ioc *ioc;
  2473. int i, cpu, ret;
  2474. ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
  2475. if (!ioc)
  2476. return -ENOMEM;
  2477. ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
  2478. if (!ioc->pcpu_stat) {
  2479. kfree(ioc);
  2480. return -ENOMEM;
  2481. }
  2482. for_each_possible_cpu(cpu) {
  2483. struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
  2484. for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
  2485. local_set(&ccs->missed[i].nr_met, 0);
  2486. local_set(&ccs->missed[i].nr_missed, 0);
  2487. }
  2488. local64_set(&ccs->rq_wait_ns, 0);
  2489. }
  2490. spin_lock_init(&ioc->lock);
  2491. timer_setup(&ioc->timer, ioc_timer_fn, 0);
  2492. INIT_LIST_HEAD(&ioc->active_iocgs);
  2493. ioc->running = IOC_IDLE;
  2494. ioc->vtime_base_rate = VTIME_PER_USEC;
  2495. atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
  2496. seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
  2497. ioc->period_at = ktime_to_us(blk_time_get());
  2498. atomic64_set(&ioc->cur_period, 0);
  2499. atomic_set(&ioc->hweight_gen, 0);
  2500. spin_lock_irq(&ioc->lock);
  2501. ioc->autop_idx = AUTOP_INVALID;
  2502. ioc_refresh_params_disk(ioc, true, disk);
  2503. spin_unlock_irq(&ioc->lock);

	/*
	 * rqos must be added before activation to allow ioc_pd_init() to
	 * look up the ioc from q.  This means the rqos methods may be
	 * called before policy activation completes, so they can't assume
	 * the target bio has an associated iocg and must check for a NULL
	 * iocg.
	 */
	ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops);
	if (ret)
		goto err_free_ioc;

	ret = blkcg_activate_policy(disk, &blkcg_policy_iocost);
	if (ret)
		goto err_del_qos;
	return 0;

err_del_qos:
	rq_qos_del(&ioc->rqos);
err_free_ioc:
	free_percpu(ioc->pcpu_stat);
	kfree(ioc);
	return ret;
}
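
/*
 * Per-blkcg (cgroup-wide) policy data.  dfl_weight holds the "default"
 * value of io.weight and starts at CGROUP_WEIGHT_DFL.
 */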
static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
{
	struct ioc_cgrp *iocc;

	iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
	if (!iocc)
		return NULL;

	iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
	return &iocc->cpd;
}

static void ioc_cpd_free(struct blkcg_policy_data *cpd)
{
	kfree(container_of(cpd, struct ioc_cgrp, cpd));
}
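
/*
 * Per (cgroup, disk) state.  The iocg embeds a flexible array of ancestor
 * pointers, one slot per hierarchy level including its own, so the
 * allocation is sized with struct_size() on the cgroup's level.
 */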
static struct blkg_policy_data *ioc_pd_alloc(struct gendisk *disk,
		struct blkcg *blkcg, gfp_t gfp)
{
	int levels = blkcg->css.cgroup->level + 1;
	struct ioc_gq *iocg;

	iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp,
			    disk->node_id);
	if (!iocg)
		return NULL;

	iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
	if (!iocg->pcpu_stat) {
		kfree(iocg);
		return NULL;
	}

	return &iocg->pd;
}
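
/*
 * Initialize a newly attached iocg: start its vtimes at the device's
 * current vtime, record the ancestor chain used for hierarchical weight
 * propagation, and apply the currently configured weight.
 */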
static void ioc_pd_init(struct blkg_policy_data *pd)
{
	struct ioc_gq *iocg = pd_to_iocg(pd);
	struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
	struct ioc *ioc = q_to_ioc(blkg->q);
	struct ioc_now now;
	struct blkcg_gq *tblkg;
	unsigned long flags;

	ioc_now(ioc, &now);

	iocg->ioc = ioc;
	atomic64_set(&iocg->vtime, now.vnow);
	atomic64_set(&iocg->done_vtime, now.vnow);
	atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
	INIT_LIST_HEAD(&iocg->active_list);
	INIT_LIST_HEAD(&iocg->walk_list);
	INIT_LIST_HEAD(&iocg->surplus_list);
	iocg->hweight_active = WEIGHT_ONE;
	iocg->hweight_inuse = WEIGHT_ONE;

	init_waitqueue_head(&iocg->waitq);
	hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	iocg->waitq_timer.function = iocg_waitq_timer_fn;

	iocg->level = blkg->blkcg->css.cgroup->level;

	for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
		struct ioc_gq *tiocg = blkg_to_iocg(tblkg);

		iocg->ancestors[tiocg->level] = tiocg;
	}

	spin_lock_irqsave(&ioc->lock, flags);
	weight_updated(iocg, &now);
	spin_unlock_irqrestore(&ioc->lock, flags);
}
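
/*
 * iocg->ioc is only set by ioc_pd_init(), so it can still be NULL here.
 * An active iocg gives up its weight share and leaves the active list
 * before its percpu stats and the iocg itself are freed.
 */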
static void ioc_pd_free(struct blkg_policy_data *pd)
{
	struct ioc_gq *iocg = pd_to_iocg(pd);
	struct ioc *ioc = iocg->ioc;
	unsigned long flags;

	if (ioc) {
		spin_lock_irqsave(&ioc->lock, flags);

		if (!list_empty(&iocg->active_list)) {
			struct ioc_now now;

			ioc_now(ioc, &now);
			propagate_weights(iocg, 0, 0, false, &now);
			list_del_init(&iocg->active_list);
		}

		WARN_ON_ONCE(!list_empty(&iocg->walk_list));
		WARN_ON_ONCE(!list_empty(&iocg->surplus_list));

		spin_unlock_irqrestore(&ioc->lock, flags);

		hrtimer_cancel(&iocg->waitq_timer);
	}
	free_percpu(iocg->pcpu_stat);
	kfree(iocg);
}
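
/*
 * io.stat extension: print the current vrate (root cgroup only, as a
 * percentage with two decimals) and per-cgroup usage, plus wait/debt/delay
 * details when blkcg debug stats are enabled.
 */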
static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
{
	struct ioc_gq *iocg = pd_to_iocg(pd);
	struct ioc *ioc = iocg->ioc;

	if (!ioc->enabled)
		return;

	if (iocg->level == 0) {
		unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
			ioc->vtime_base_rate * 10000,
			VTIME_PER_USEC);
		seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100);
	}

	seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us);

	if (blkcg_debug_stats)
		seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
			iocg->last_stat.wait_us,
			iocg->last_stat.indebt_us,
			iocg->last_stat.indelay_us);
}
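
/*
 * io.weight read side.  The first line is "default <weight>"; devices
 * with an explicit per-cgroup weight follow as "<device> <weight>" lines
 * (the device name comes from blkg_dev_name(), typically MAJ:MIN).
 */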
static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
			     int off)
{
	const char *dname = blkg_dev_name(pd->blkg);
	struct ioc_gq *iocg = pd_to_iocg(pd);

	if (dname && iocg->cfg_weight)
		seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
	return 0;
}

static int ioc_weight_show(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);

	seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
	blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
	return 0;
}
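
/*
 * io.weight write side.  Input without a device, e.g. "default 100" or
 * just "100", updates the cgroup-wide default and re-propagates weights
 * on every device.  Input with a device, e.g. "8:16 200" (illustrative),
 * sets that device's weight for this cgroup; "8:16 default" clears it.
 */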
static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
				size_t nbytes, loff_t off)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
	struct blkg_conf_ctx ctx;
	struct ioc_now now;
	struct ioc_gq *iocg;
	u32 v;
	int ret;

	if (!strchr(buf, ':')) {
		struct blkcg_gq *blkg;

		if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
			return -EINVAL;
		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
			return -EINVAL;

		spin_lock_irq(&blkcg->lock);
		iocc->dfl_weight = v * WEIGHT_ONE;
		hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
			struct ioc_gq *iocg = blkg_to_iocg(blkg);

			if (iocg) {
				spin_lock(&iocg->ioc->lock);
				ioc_now(iocg->ioc, &now);
				weight_updated(iocg, &now);
				spin_unlock(&iocg->ioc->lock);
			}
		}
		spin_unlock_irq(&blkcg->lock);

		return nbytes;
	}

	blkg_conf_init(&ctx, buf);

	ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx);
	if (ret)
		goto err;

	iocg = blkg_to_iocg(ctx.blkg);

	if (!strncmp(ctx.body, "default", 7)) {
		v = 0;
	} else {
		if (!sscanf(ctx.body, "%u", &v))
			goto einval;
		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
			goto einval;
	}

	spin_lock(&iocg->ioc->lock);
	iocg->cfg_weight = v * WEIGHT_ONE;
	ioc_now(iocg->ioc, &now);
	weight_updated(iocg, &now);
	spin_unlock(&iocg->ioc->lock);

	blkg_conf_exit(&ctx);
	return nbytes;

einval:
	ret = -EINVAL;
err:
	blkg_conf_exit(&ctx);
	return ret;
}
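
/*
 * io.cost.qos read side: one line per device showing the enable state,
 * control mode (user/auto), read/write latency percentile targets and
 * the min/max vrate bounds.
 */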
static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
			  int off)
{
	const char *dname = blkg_dev_name(pd->blkg);
	struct ioc *ioc = pd_to_iocg(pd)->ioc;

	if (!dname)
		return 0;

	spin_lock(&ioc->lock);
	seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
		   dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
		   ioc->params.qos[QOS_RPPM] / 10000,
		   ioc->params.qos[QOS_RPPM] % 10000 / 100,
		   ioc->params.qos[QOS_RLAT],
		   ioc->params.qos[QOS_WPPM] / 10000,
		   ioc->params.qos[QOS_WPPM] % 10000 / 100,
		   ioc->params.qos[QOS_WLAT],
		   ioc->params.qos[QOS_MIN] / 10000,
		   ioc->params.qos[QOS_MIN] % 10000 / 100,
		   ioc->params.qos[QOS_MAX] / 10000,
		   ioc->params.qos[QOS_MAX] % 10000 / 100);
	spin_unlock(&ioc->lock);
	return 0;
}

static int ioc_qos_show(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));

	blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
	return 0;
}

static const match_table_t qos_ctrl_tokens = {
	{ QOS_ENABLE, "enable=%u" },
	{ QOS_CTRL, "ctrl=%s" },
	{ NR_QOS_CTRL_PARAMS, NULL },
};

static const match_table_t qos_tokens = {
	{ QOS_RPPM, "rpct=%s" },
	{ QOS_RLAT, "rlat=%u" },
	{ QOS_WPPM, "wpct=%s" },
	{ QOS_WLAT, "wlat=%u" },
	{ QOS_MIN, "min=%s" },
	{ QOS_MAX, "max=%s" },
	{ NR_QOS_PARAMS, NULL },
};
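
/*
 * io.cost.qos write side.  Accepts whitespace-separated key=value tokens,
 * e.g. (values illustrative):
 *
 *   8:16 enable=1 ctrl=auto rpct=95.00 rlat=5000 wpct=95.00 wlat=5000 min=50.00 max=150.00
 *
 * Percentile and vrate values are fixed-point with two decimal places.
 * Setting any QoS parameter switches control to "user" mode; ctrl=auto
 * clears it so the parameters are re-derived automatically.
 */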
static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
			     size_t nbytes, loff_t off)
{
	struct blkg_conf_ctx ctx;
	struct gendisk *disk;
	struct ioc *ioc;
	u32 qos[NR_QOS_PARAMS];
	bool enable, user;
	char *body, *p;
	int ret;

	blkg_conf_init(&ctx, input);

	ret = blkg_conf_open_bdev(&ctx);
	if (ret)
		goto err;

	body = ctx.body;
	disk = ctx.bdev->bd_disk;
	if (!queue_is_mq(disk->queue)) {
		ret = -EOPNOTSUPP;
		goto err;
	}

	ioc = q_to_ioc(disk->queue);
	if (!ioc) {
		ret = blk_iocost_init(disk);
		if (ret)
			goto err;
		ioc = q_to_ioc(disk->queue);
	}

	blk_mq_freeze_queue(disk->queue);
	blk_mq_quiesce_queue(disk->queue);

	spin_lock_irq(&ioc->lock);
	memcpy(qos, ioc->params.qos, sizeof(qos));
	enable = ioc->enabled;
	user = ioc->user_qos_params;

	while ((p = strsep(&body, " \t\n"))) {
		substring_t args[MAX_OPT_ARGS];
		char buf[32];
		int tok;
		s64 v;

		if (!*p)
			continue;

		switch (match_token(p, qos_ctrl_tokens, args)) {
		case QOS_ENABLE:
			if (match_u64(&args[0], &v))
				goto einval;
			enable = v;
			continue;
		case QOS_CTRL:
			match_strlcpy(buf, &args[0], sizeof(buf));
			if (!strcmp(buf, "auto"))
				user = false;
			else if (!strcmp(buf, "user"))
				user = true;
			else
				goto einval;
			continue;
		}

		tok = match_token(p, qos_tokens, args);
		switch (tok) {
		case QOS_RPPM:
		case QOS_WPPM:
			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
			    sizeof(buf))
				goto einval;
			if (cgroup_parse_float(buf, 2, &v))
				goto einval;
			if (v < 0 || v > 10000)
				goto einval;
			qos[tok] = v * 100;
			break;
		case QOS_RLAT:
		case QOS_WLAT:
			if (match_u64(&args[0], &v))
				goto einval;
			qos[tok] = v;
			break;
		case QOS_MIN:
		case QOS_MAX:
			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
			    sizeof(buf))
				goto einval;
			if (cgroup_parse_float(buf, 2, &v))
				goto einval;
			if (v < 0)
				goto einval;
			qos[tok] = clamp_t(s64, v * 100,
					   VRATE_MIN_PPM, VRATE_MAX_PPM);
			break;
		default:
			goto einval;
		}
		user = true;
	}

	if (qos[QOS_MIN] > qos[QOS_MAX])
		goto einval;

	if (enable && !ioc->enabled) {
		blk_stat_enable_accounting(disk->queue);
		blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
		ioc->enabled = true;
	} else if (!enable && ioc->enabled) {
		blk_stat_disable_accounting(disk->queue);
		blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
		ioc->enabled = false;
	}

	if (user) {
		memcpy(ioc->params.qos, qos, sizeof(qos));
		ioc->user_qos_params = true;
	} else {
		ioc->user_qos_params = false;
	}

	ioc_refresh_params(ioc, true);
	spin_unlock_irq(&ioc->lock);

	if (enable)
		wbt_disable_default(disk);
	else
		wbt_enable_default(disk);

	blk_mq_unquiesce_queue(disk->queue);
	blk_mq_unfreeze_queue(disk->queue);

	blkg_conf_exit(&ctx);
	return nbytes;

einval:
	spin_unlock_irq(&ioc->lock);

	blk_mq_unquiesce_queue(disk->queue);
	blk_mq_unfreeze_queue(disk->queue);

	ret = -EINVAL;
err:
	blkg_conf_exit(&ctx);
	return ret;
}
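
/*
 * io.cost.model read side: one line per device reporting the control mode
 * and the linear model coefficients (bytes per second plus sequential and
 * random IOPS, for reads and writes).
 */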
static u64 ioc_cost_model_prfill(struct seq_file *sf,
				 struct blkg_policy_data *pd, int off)
{
	const char *dname = blkg_dev_name(pd->blkg);
	struct ioc *ioc = pd_to_iocg(pd)->ioc;
	u64 *u = ioc->params.i_lcoefs;

	if (!dname)
		return 0;

	spin_lock(&ioc->lock);
	seq_printf(sf, "%s ctrl=%s model=linear "
		   "rbps=%llu rseqiops=%llu rrandiops=%llu "
		   "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
		   dname, ioc->user_cost_model ? "user" : "auto",
		   u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
		   u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
	spin_unlock(&ioc->lock);
	return 0;
}

static int ioc_cost_model_show(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));

	blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
	return 0;
}

static const match_table_t cost_ctrl_tokens = {
	{ COST_CTRL, "ctrl=%s" },
	{ COST_MODEL, "model=%s" },
	{ NR_COST_CTRL_PARAMS, NULL },
};

static const match_table_t i_lcoef_tokens = {
	{ I_LCOEF_RBPS, "rbps=%u" },
	{ I_LCOEF_RSEQIOPS, "rseqiops=%u" },
	{ I_LCOEF_RRANDIOPS, "rrandiops=%u" },
	{ I_LCOEF_WBPS, "wbps=%u" },
	{ I_LCOEF_WSEQIOPS, "wseqiops=%u" },
	{ I_LCOEF_WRANDIOPS, "wrandiops=%u" },
	{ NR_I_LCOEFS, NULL },
};
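
/*
 * io.cost.model write side.  Accepts key=value tokens, e.g. (values
 * illustrative):
 *
 *   8:16 ctrl=user rbps=2000000000 rseqiops=200000 rrandiops=50000
 *
 * Only the "linear" model is accepted.  Writing any coefficient switches
 * the model to "user" mode; ctrl=auto clears it so the coefficients are
 * re-derived automatically.
 */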
static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
				    size_t nbytes, loff_t off)
{
	struct blkg_conf_ctx ctx;
	struct request_queue *q;
	struct ioc *ioc;
	u64 u[NR_I_LCOEFS];
	bool user;
	char *body, *p;
	int ret;

	blkg_conf_init(&ctx, input);

	ret = blkg_conf_open_bdev(&ctx);
	if (ret)
		goto err;

	body = ctx.body;
	q = bdev_get_queue(ctx.bdev);
	if (!queue_is_mq(q)) {
		ret = -EOPNOTSUPP;
		goto err;
	}

	ioc = q_to_ioc(q);
	if (!ioc) {
		ret = blk_iocost_init(ctx.bdev->bd_disk);
		if (ret)
			goto err;
		ioc = q_to_ioc(q);
	}

	blk_mq_freeze_queue(q);
	blk_mq_quiesce_queue(q);

	spin_lock_irq(&ioc->lock);
	memcpy(u, ioc->params.i_lcoefs, sizeof(u));
	user = ioc->user_cost_model;

	while ((p = strsep(&body, " \t\n"))) {
		substring_t args[MAX_OPT_ARGS];
		char buf[32];
		int tok;
		u64 v;

		if (!*p)
			continue;

		switch (match_token(p, cost_ctrl_tokens, args)) {
		case COST_CTRL:
			match_strlcpy(buf, &args[0], sizeof(buf));
			if (!strcmp(buf, "auto"))
				user = false;
			else if (!strcmp(buf, "user"))
				user = true;
			else
				goto einval;
			continue;
		case COST_MODEL:
			match_strlcpy(buf, &args[0], sizeof(buf));
			if (strcmp(buf, "linear"))
				goto einval;
			continue;
		}

		tok = match_token(p, i_lcoef_tokens, args);
		if (tok == NR_I_LCOEFS)
			goto einval;
		if (match_u64(&args[0], &v))
			goto einval;
		u[tok] = v;
		user = true;
	}

	if (user) {
		memcpy(ioc->params.i_lcoefs, u, sizeof(u));
		ioc->user_cost_model = true;
	} else {
		ioc->user_cost_model = false;
	}

	ioc_refresh_params(ioc, true);
	spin_unlock_irq(&ioc->lock);

	blk_mq_unquiesce_queue(q);
	blk_mq_unfreeze_queue(q);

	blkg_conf_exit(&ctx);
	return nbytes;

einval:
	spin_unlock_irq(&ioc->lock);

	blk_mq_unquiesce_queue(q);
	blk_mq_unfreeze_queue(q);

	ret = -EINVAL;
err:
	blkg_conf_exit(&ctx);
	return ret;
}

static struct cftype ioc_files[] = {
	{
		.name = "weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = ioc_weight_show,
		.write = ioc_weight_write,
	},
	{
		.name = "cost.qos",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = ioc_qos_show,
		.write = ioc_qos_write,
	},
	{
		.name = "cost.model",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = ioc_cost_model_show,
		.write = ioc_cost_model_write,
	},
	{}
};

static struct blkcg_policy blkcg_policy_iocost = {
	.dfl_cftypes = ioc_files,
	.cpd_alloc_fn = ioc_cpd_alloc,
	.cpd_free_fn = ioc_cpd_free,
	.pd_alloc_fn = ioc_pd_alloc,
	.pd_init_fn = ioc_pd_init,
	.pd_free_fn = ioc_pd_free,
	.pd_stat_fn = ioc_pd_stat,
};

static int __init ioc_init(void)
{
	return blkcg_policy_register(&blkcg_policy_iocost);
}

static void __exit ioc_exit(void)
{
	blkcg_policy_unregister(&blkcg_policy_iocost);
}

module_init(ioc_init);
module_exit(ioc_exit);