// SPDX-License-Identifier: GPL-2.0
/*
 * buffered writeback throttling. loosely based on CoDel. We can't drop
 * packets for IO scheduling, so the logic is something like this:
 *
 * - Monitor latencies in a defined window of time.
 * - If the minimum latency in the above window exceeds some target, increment
 *   scaling step and scale down queue depth by a factor of 2x. The monitoring
 *   window is then shrunk to 100ms / sqrt(scaling step + 1).
 * - For any window where we don't have solid data on what the latencies
 *   look like, retain status quo.
 * - If latencies look good, decrement scaling step.
 * - If we're only doing writes, allow the scaling step to go negative. This
 *   will temporarily boost write performance, snapping back to a stable
 *   scaling step of 0 if reads show up or the heavy writers finish. Unlike
 *   positive scaling steps where we shrink the monitoring window, a negative
 *   scaling step retains the default step==0 window size.
 *
 * Copyright (C) 2016 Jens Axboe
 *
 */
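
/*
 * Concretely, with the default 100ms monitoring window: scale step 0 keeps
 * the window at 100ms, step 1 shrinks it to roughly 70ms and step 3 to 50ms
 * (100ms / sqrt(step + 1), see rwb_arm_timer() below), while each positive
 * step also roughly halves the allowed write depth via rq_depth_scale_down().
 */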

#include <linux/kernel.h>
#include <linux/blk_types.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>

#include "blk-stat.h"
#include "blk-wbt.h"
#include "blk-rq-qos.h"
#include "elevator.h"
#include "blk.h"

#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>

enum wbt_flags {
        WBT_TRACKED = 1,        /* write, tracked for throttling */
        WBT_READ = 2,           /* read */
        WBT_SWAP = 4,           /* write, from swap_writepage() */
        WBT_DISCARD = 8,        /* discard */

        WBT_NR_BITS = 4,        /* number of bits */
};

enum {
        WBT_RWQ_BG = 0,
        WBT_RWQ_SWAP,
        WBT_RWQ_DISCARD,
        WBT_NUM_RWQ,
};

/*
 * If the current state is WBT_STATE_ON/OFF_DEFAULT, it can be changed to any
 * other state; if the current state is WBT_STATE_ON/OFF_MANUAL, it can only
 * be changed to WBT_STATE_OFF/ON_MANUAL.
 */
enum {
        WBT_STATE_ON_DEFAULT = 1,       /* on by default */
        WBT_STATE_ON_MANUAL = 2,        /* on manually by sysfs */
        WBT_STATE_OFF_DEFAULT = 3,      /* off by default */
        WBT_STATE_OFF_MANUAL = 4,       /* off manually by sysfs */
};

struct rq_wb {
        /*
         * Settings that govern how we throttle
         */
        unsigned int wb_background;     /* background writeback */
        unsigned int wb_normal;         /* normal writeback */

        short enable_state;             /* WBT_STATE_* */

        /*
         * Number of consecutive periods where we don't have enough
         * information to make a firm scale up/down decision.
         */
        unsigned int unknown_cnt;

        u64 win_nsec;                   /* default window size */
        u64 cur_win_nsec;               /* current window size */

        struct blk_stat_callback *cb;

        u64 sync_issue;
        void *sync_cookie;

        unsigned long last_issue;       /* last non-throttled issue */
        unsigned long last_comp;        /* last non-throttled comp */

        unsigned long min_lat_nsec;
        struct rq_qos rqos;
        struct rq_wait rq_wait[WBT_NUM_RWQ];
        struct rq_depth rq_depth;
};

static inline struct rq_wb *RQWB(struct rq_qos *rqos)
{
        return container_of(rqos, struct rq_wb, rqos);
}

static inline void wbt_clear_state(struct request *rq)
{
        rq->wbt_flags = 0;
}

static inline enum wbt_flags wbt_flags(struct request *rq)
{
        return rq->wbt_flags;
}

static inline bool wbt_is_tracked(struct request *rq)
{
        return rq->wbt_flags & WBT_TRACKED;
}

static inline bool wbt_is_read(struct request *rq)
{
        return rq->wbt_flags & WBT_READ;
}

enum {
        /*
         * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
         * from here depending on device stats
         */
        RWB_DEF_DEPTH = 16,

        /*
         * 100msec window
         */
        RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL,

        /*
         * Disregard stats, if we don't meet this minimum
         */
        RWB_MIN_WRITE_SAMPLES = 3,

        /*
         * If we have this number of consecutive windows without enough
         * information to scale up or down, scale up.
         */
        RWB_UNKNOWN_BUMP = 5,
};

static inline bool rwb_enabled(struct rq_wb *rwb)
{
        return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT &&
                      rwb->enable_state != WBT_STATE_OFF_MANUAL;
}

static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
{
        if (rwb_enabled(rwb)) {
                const unsigned long cur = jiffies;

                if (cur != *var)
                        *var = cur;
        }
}

/*
 * If a task was rate throttled in balance_dirty_pages() within the last
 * second or so, use that to indicate a higher cleaning rate.
 */
static bool wb_recent_wait(struct rq_wb *rwb)
{
        struct backing_dev_info *bdi = rwb->rqos.disk->bdi;

        return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}

static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
                                          enum wbt_flags wb_acct)
{
        if (wb_acct & WBT_SWAP)
                return &rwb->rq_wait[WBT_RWQ_SWAP];
        else if (wb_acct & WBT_DISCARD)
                return &rwb->rq_wait[WBT_RWQ_DISCARD];

        return &rwb->rq_wait[WBT_RWQ_BG];
}

static void rwb_wake_all(struct rq_wb *rwb)
{
        int i;

        for (i = 0; i < WBT_NUM_RWQ; i++) {
                struct rq_wait *rqw = &rwb->rq_wait[i];

                if (wq_has_sleeper(&rqw->wait))
                        wake_up_all(&rqw->wait);
        }
}

static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw,
                         enum wbt_flags wb_acct)
{
        int inflight, limit;

        inflight = atomic_dec_return(&rqw->inflight);

        /*
         * For discards, our limit is always the background. For writes, if
         * the device does write back caching, drop further down before we
         * wake people up.
         */
        if (wb_acct & WBT_DISCARD)
                limit = rwb->wb_background;
        else if (blk_queue_write_cache(rwb->rqos.disk->queue) &&
                 !wb_recent_wait(rwb))
                limit = 0;
        else
                limit = rwb->wb_normal;

        /*
         * Don't wake anyone up if we are above the normal limit.
         */
        if (inflight && inflight >= limit)
                return;

        if (wq_has_sleeper(&rqw->wait)) {
                int diff = limit - inflight;

                if (!inflight || diff >= rwb->wb_background / 2)
                        wake_up_all(&rqw->wait);
        }
}
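
/*
 * Example of the wake-up heuristic above: with a max depth of 16,
 * calc_wb_limits() gives wb_normal = 8 and wb_background = 4. A completion
 * then wakes waiters once inflight drops to 0, or once it is below the
 * limit with at least wb_background / 2 == 2 slots free, i.e. at 6 or fewer
 * requests in flight. With a write-back cache and no recent
 * balance_dirty_pages() sleep, the limit is 0 and only the last completion
 * wakes anyone.
 */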

static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
{
        struct rq_wb *rwb = RQWB(rqos);
        struct rq_wait *rqw;

        if (!(wb_acct & WBT_TRACKED))
                return;

        rqw = get_rq_wait(rwb, wb_acct);
        wbt_rqw_done(rwb, rqw, wb_acct);
}

/*
 * Called on completion of a request. Note that it's also called when
 * a request is merged, at the point the merged request gets freed.
 */
static void wbt_done(struct rq_qos *rqos, struct request *rq)
{
        struct rq_wb *rwb = RQWB(rqos);

        if (!wbt_is_tracked(rq)) {
                if (rwb->sync_cookie == rq) {
                        rwb->sync_issue = 0;
                        rwb->sync_cookie = NULL;
                }

                if (wbt_is_read(rq))
                        wb_timestamp(rwb, &rwb->last_comp);
        } else {
                WARN_ON_ONCE(rq == rwb->sync_cookie);
                __wbt_done(rqos, wbt_flags(rq));
        }

        wbt_clear_state(rq);
}

static inline bool stat_sample_valid(struct blk_rq_stat *stat)
{
        /*
         * We need at least one read sample, and a minimum of
         * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
         * that it's writes impacting us, and not just some sole read on
         * a device that is in a lower power state.
         */
        return (stat[READ].nr_samples >= 1 &&
                stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES);
}

static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
{
        u64 issue = READ_ONCE(rwb->sync_issue);

        if (!issue || !rwb->sync_cookie)
                return 0;

        return blk_time_get_ns() - issue;
}

static inline unsigned int wbt_inflight(struct rq_wb *rwb)
{
        unsigned int i, ret = 0;

        for (i = 0; i < WBT_NUM_RWQ; i++)
                ret += atomic_read(&rwb->rq_wait[i].inflight);

        return ret;
}

enum {
        LAT_OK = 1,
        LAT_UNKNOWN,
        LAT_UNKNOWN_WRITES,
        LAT_EXCEEDED,
};

static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
{
        struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
        struct rq_depth *rqd = &rwb->rq_depth;
        u64 thislat;

        /*
         * If our stored sync issue exceeds the window size, or it
         * exceeds our min target AND we haven't logged any entries,
         * flag the latency as exceeded. wbt works off completion latencies,
         * but for a flooded device, a single sync IO can take a long time
         * to complete after being issued. If this time exceeds our
         * monitoring window AND we didn't see any other completions in that
         * window, then count that sync IO as a violation of the latency.
         */
        thislat = rwb_sync_issue_lat(rwb);
        if (thislat > rwb->cur_win_nsec ||
            (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) {
                trace_wbt_lat(bdi, thislat);
                return LAT_EXCEEDED;
        }

        /*
         * No read/write mix, if stat isn't valid
         */
        if (!stat_sample_valid(stat)) {
                /*
                 * If we had writes in this stat window and the window is
                 * current, we're only doing writes. If a task recently
                 * waited or still has writes in flight, consider us doing
                 * just writes as well.
                 */
                if (stat[WRITE].nr_samples || wb_recent_wait(rwb) ||
                    wbt_inflight(rwb))
                        return LAT_UNKNOWN_WRITES;
                return LAT_UNKNOWN;
        }

        /*
         * If the 'min' latency exceeds our target, step down.
         */
        if (stat[READ].min > rwb->min_lat_nsec) {
                trace_wbt_lat(bdi, stat[READ].min);
                trace_wbt_stat(bdi, stat);
                return LAT_EXCEEDED;
        }

        if (rqd->scale_step)
                trace_wbt_stat(bdi, stat);

        return LAT_OK;
}

static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
{
        struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
        struct rq_depth *rqd = &rwb->rq_depth;

        trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec,
                       rwb->wb_background, rwb->wb_normal, rqd->max_depth);
}

static void calc_wb_limits(struct rq_wb *rwb)
{
        if (rwb->min_lat_nsec == 0) {
                rwb->wb_normal = rwb->wb_background = 0;
        } else if (rwb->rq_depth.max_depth <= 2) {
                rwb->wb_normal = rwb->rq_depth.max_depth;
                rwb->wb_background = 1;
        } else {
                rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2;
                rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4;
        }
}
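
/*
 * For example, with a max_depth of 16, calc_wb_limits() yields
 * wb_normal = (16 + 1) / 2 = 8 and wb_background = (16 + 3) / 4 = 4:
 * background writeback may use roughly a quarter of the scaled depth,
 * other buffered writes roughly half, and only high-priority writers (see
 * get_limit()) get the full max_depth. When min_lat_nsec is 0 (wbt switched
 * off), both limits are simply set to 0.
 */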

static void scale_up(struct rq_wb *rwb)
{
        if (!rq_depth_scale_up(&rwb->rq_depth))
                return;
        calc_wb_limits(rwb);
        rwb->unknown_cnt = 0;
        rwb_wake_all(rwb);
        rwb_trace_step(rwb, tracepoint_string("scale up"));
}

static void scale_down(struct rq_wb *rwb, bool hard_throttle)
{
        if (!rq_depth_scale_down(&rwb->rq_depth, hard_throttle))
                return;
        calc_wb_limits(rwb);
        rwb->unknown_cnt = 0;
        rwb_trace_step(rwb, tracepoint_string("scale down"));
}

static void rwb_arm_timer(struct rq_wb *rwb)
{
        struct rq_depth *rqd = &rwb->rq_depth;

        if (rqd->scale_step > 0) {
                /*
                 * We should speed this up, using some variant of a fast
                 * integer inverse square root calculation. Since we only do
                 * this for every window expiration, it's not a huge deal,
                 * though.
                 */
                rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
                                        int_sqrt((rqd->scale_step + 1) << 8));
        } else {
                /*
                 * For step < 0, we don't want to increase/decrease the
                 * window size.
                 */
                rwb->cur_win_nsec = rwb->win_nsec;
        }

        blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
}
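
/*
 * The shifts above implement win_nsec / sqrt(step + 1) in fixed point:
 * (win_nsec * 16) / int_sqrt((step + 1) * 256) equals
 * (win_nsec * 16) / (16 * sqrt(step + 1)). For example, at step 3 with the
 * default 100ms window this gives 100ms * 16 / int_sqrt(1024) =
 * 1600ms / 32 = 50ms.
 */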

static void wb_timer_fn(struct blk_stat_callback *cb)
{
        struct rq_wb *rwb = cb->data;
        struct rq_depth *rqd = &rwb->rq_depth;
        unsigned int inflight = wbt_inflight(rwb);
        int status;

        if (!rwb->rqos.disk)
                return;

        status = latency_exceeded(rwb, cb->stat);

        trace_wbt_timer(rwb->rqos.disk->bdi, status, rqd->scale_step, inflight);

        /*
         * If we exceeded the latency target, step down. If we did not,
         * step one level up. If we don't know enough to say either exceeded
         * or ok, then don't do anything.
         */
        switch (status) {
        case LAT_EXCEEDED:
                scale_down(rwb, true);
                break;
        case LAT_OK:
                scale_up(rwb);
                break;
        case LAT_UNKNOWN_WRITES:
                /*
                 * We started at the center step, but don't have a valid
                 * read/write sample, though we do have writes going on.
                 * Allow the step to go negative, to increase write perf.
                 */
                scale_up(rwb);
                break;
        case LAT_UNKNOWN:
                if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
                        break;
                /*
                 * We get here when the scale step was previously moved away
                 * from zero and we currently don't have a valid read/write
                 * sample. For that case, slowly return to the center state
                 * (step == 0).
                 */
                if (rqd->scale_step > 0)
                        scale_up(rwb);
                else if (rqd->scale_step < 0)
                        scale_down(rwb, false);
                break;
        default:
                break;
        }

        /*
         * Re-arm timer, if we have IO in flight
         */
        if (rqd->scale_step || inflight)
                rwb_arm_timer(rwb);
}

static void wbt_update_limits(struct rq_wb *rwb)
{
        struct rq_depth *rqd = &rwb->rq_depth;

        rqd->scale_step = 0;
        rqd->scaled_max = false;

        rq_depth_calc_max_depth(rqd);
        calc_wb_limits(rwb);

        rwb_wake_all(rwb);
}

bool wbt_disabled(struct request_queue *q)
{
        struct rq_qos *rqos = wbt_rq_qos(q);

        return !rqos || !rwb_enabled(RQWB(rqos));
}

u64 wbt_get_min_lat(struct request_queue *q)
{
        struct rq_qos *rqos = wbt_rq_qos(q);

        if (!rqos)
                return 0;
        return RQWB(rqos)->min_lat_nsec;
}

void wbt_set_min_lat(struct request_queue *q, u64 val)
{
        struct rq_qos *rqos = wbt_rq_qos(q);

        if (!rqos)
                return;

        RQWB(rqos)->min_lat_nsec = val;
        if (val)
                RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
        else
                RQWB(rqos)->enable_state = WBT_STATE_OFF_MANUAL;
        wbt_update_limits(RQWB(rqos));
}
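
/*
 * wbt_get_min_lat() and wbt_set_min_lat() back the per-queue latency target
 * exposed to userspace (the wbt_lat_usec queue attribute in current kernels,
 * with the microsecond/nanosecond conversion done by the sysfs code).
 * Writing a non-zero latency switches the state to WBT_STATE_ON_MANUAL and
 * writing 0 to WBT_STATE_OFF_MANUAL; per the state comment above, a manual
 * setting is not overridden by the default enable/disable paths.
 */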

static bool close_io(struct rq_wb *rwb)
{
        const unsigned long now = jiffies;

        return time_before(now, rwb->last_issue + HZ / 10) ||
               time_before(now, rwb->last_comp + HZ / 10);
}

#define REQ_HIPRIO      (REQ_SYNC | REQ_META | REQ_PRIO | REQ_SWAP)

static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf)
{
        unsigned int limit;

        if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD)
                return rwb->wb_background;

        /*
         * At this point we know it's a buffered write. If this is
         * swap trying to free memory, or REQ_SYNC is set, then
         * it's WB_SYNC_ALL writeback, and we'll use the max limit for
         * that. If the write is marked as a background write, then use
         * the idle limit, or go to normal if we haven't had competing
         * IO for a bit.
         */
        if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb))
                limit = rwb->rq_depth.max_depth;
        else if ((opf & REQ_BACKGROUND) || close_io(rwb)) {
                /*
                 * If less than 100ms since we completed unrelated IO,
                 * limit us to half the depth for background writeback.
                 */
                limit = rwb->wb_background;
        } else
                limit = rwb->wb_normal;

        return limit;
}
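
/*
 * Summary of the precedence above: discards always get wb_background.
 * Writes flagged REQ_SYNC/REQ_META/REQ_PRIO/REQ_SWAP, or any write issued
 * while a task was recently throttled in balance_dirty_pages(), get the
 * full max_depth. Otherwise, background writes and writes issued within
 * ~100ms (HZ / 10) of unrelated untracked IO get wb_background, and the
 * rest get wb_normal.
 */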

struct wbt_wait_data {
        struct rq_wb *rwb;
        enum wbt_flags wb_acct;
        blk_opf_t opf;
};

static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data)
{
        struct wbt_wait_data *data = private_data;

        return rq_wait_inc_below(rqw, get_limit(data->rwb, data->opf));
}

static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data)
{
        struct wbt_wait_data *data = private_data;

        wbt_rqw_done(data->rwb, rqw, data->wb_acct);
}

/*
 * Block if we will exceed our limit, or if we are currently waiting for
 * the timer to kick off queuing again.
 */
static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
                       blk_opf_t opf)
{
        struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
        struct wbt_wait_data data = {
                .rwb = rwb,
                .wb_acct = wb_acct,
                .opf = opf,
        };

        rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb);
}

static inline bool wbt_should_throttle(struct bio *bio)
{
        switch (bio_op(bio)) {
        case REQ_OP_WRITE:
                /*
                 * Don't throttle WRITE_ODIRECT
                 */
                if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) ==
                    (REQ_SYNC | REQ_IDLE))
                        return false;
                fallthrough;
        case REQ_OP_DISCARD:
                return true;
        default:
                return false;
        }
}
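
/*
 * So only regular writes and discards are candidates for throttling: reads
 * are merely timestamped, and a write carrying both REQ_SYNC and REQ_IDLE
 * (the flag combination O_DIRECT writes use, per the WRITE_ODIRECT comment
 * above) passes through untracked.
 */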

static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio)
{
        enum wbt_flags flags = 0;

        if (!rwb_enabled(rwb))
                return 0;

        if (bio_op(bio) == REQ_OP_READ) {
                flags = WBT_READ;
        } else if (wbt_should_throttle(bio)) {
                if (bio->bi_opf & REQ_SWAP)
                        flags |= WBT_SWAP;
                if (bio_op(bio) == REQ_OP_DISCARD)
                        flags |= WBT_DISCARD;
                flags |= WBT_TRACKED;
        }
        return flags;
}

static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
{
        struct rq_wb *rwb = RQWB(rqos);
        enum wbt_flags flags = bio_to_wbt_flags(rwb, bio);

        __wbt_done(rqos, flags);
}

/*
 * May sleep, if we have exceeded the writeback limits. Caller can pass
 * in an irq held spinlock, if it holds one when calling this function.
 * If we do sleep, we'll release and re-grab it.
 */
static void wbt_wait(struct rq_qos *rqos, struct bio *bio)
{
        struct rq_wb *rwb = RQWB(rqos);
        enum wbt_flags flags;

        flags = bio_to_wbt_flags(rwb, bio);
        if (!(flags & WBT_TRACKED)) {
                if (flags & WBT_READ)
                        wb_timestamp(rwb, &rwb->last_issue);
                return;
        }

        __wbt_wait(rwb, flags, bio->bi_opf);
        if (!blk_stat_is_active(rwb->cb))
                rwb_arm_timer(rwb);
}

static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
{
        struct rq_wb *rwb = RQWB(rqos);

        rq->wbt_flags |= bio_to_wbt_flags(rwb, bio);
}

static void wbt_issue(struct rq_qos *rqos, struct request *rq)
{
        struct rq_wb *rwb = RQWB(rqos);

        if (!rwb_enabled(rwb))
                return;

        /*
         * Track sync issue, in case it takes a long time to complete, so
         * that we can react more quickly. Note that this is just a hint.
         * The request can go away when it completes, so it's important we
         * never dereference it. We only use the address to compare with,
         * which is why we store the sync_issue time locally.
         */
        if (wbt_is_read(rq) && !rwb->sync_issue) {
                rwb->sync_cookie = rq;
                rwb->sync_issue = rq->io_start_time_ns;
        }
}

static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
{
        struct rq_wb *rwb = RQWB(rqos);

        if (!rwb_enabled(rwb))
                return;

        if (rq == rwb->sync_cookie) {
                rwb->sync_issue = 0;
                rwb->sync_cookie = NULL;
        }
}

/*
 * Enable wbt if defaults are configured that way
 */
void wbt_enable_default(struct gendisk *disk)
{
        struct request_queue *q = disk->queue;
        struct rq_qos *rqos;
        bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ);

        if (q->elevator &&
            test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags))
                enable = false;

        /* Throttling already enabled? */
        rqos = wbt_rq_qos(q);
        if (rqos) {
                if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
                        RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
                return;
        }

        /* Queue not registered? Maybe shutting down... */
        if (!blk_queue_registered(q))
                return;

        if (queue_is_mq(q) && enable)
                wbt_init(disk);
}
EXPORT_SYMBOL_GPL(wbt_enable_default);

u64 wbt_default_latency_nsec(struct request_queue *q)
{
        /*
         * We default to 2msec for non-rotational storage, and 75msec
         * for rotational storage.
         */
        if (blk_queue_nonrot(q))
                return 2000000ULL;
        else
                return 75000000ULL;
}

static int wbt_data_dir(const struct request *rq)
{
        const enum req_op op = req_op(rq);

        if (op == REQ_OP_READ)
                return READ;
        else if (op_is_write(op))
                return WRITE;

        /* don't account */
        return -1;
}
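
/*
 * wbt_data_dir() selects which of the two stat buckets (READ/WRITE, hence
 * the bucket count of 2 passed to blk_stat_alloc_callback() in wbt_init())
 * a completed request is accounted in; requests that are neither reads nor
 * data-out operations, such as flushes, return -1 and are left out of the
 * stats.
 */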

static void wbt_queue_depth_changed(struct rq_qos *rqos)
{
        RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->disk->queue);
        wbt_update_limits(RQWB(rqos));
}

static void wbt_exit(struct rq_qos *rqos)
{
        struct rq_wb *rwb = RQWB(rqos);

        blk_stat_remove_callback(rqos->disk->queue, rwb->cb);
        blk_stat_free_callback(rwb->cb);
        kfree(rwb);
}

/*
 * Disable wbt, if enabled by default.
 */
void wbt_disable_default(struct gendisk *disk)
{
        struct rq_qos *rqos = wbt_rq_qos(disk->queue);
        struct rq_wb *rwb;

        if (!rqos)
                return;
        rwb = RQWB(rqos);
        if (rwb->enable_state == WBT_STATE_ON_DEFAULT) {
                blk_stat_deactivate(rwb->cb);
                rwb->enable_state = WBT_STATE_OFF_DEFAULT;
        }
}
EXPORT_SYMBOL_GPL(wbt_disable_default);

#ifdef CONFIG_BLK_DEBUG_FS
static int wbt_curr_win_nsec_show(void *data, struct seq_file *m)
{
        struct rq_qos *rqos = data;
        struct rq_wb *rwb = RQWB(rqos);

        seq_printf(m, "%llu\n", rwb->cur_win_nsec);
        return 0;
}

static int wbt_enabled_show(void *data, struct seq_file *m)
{
        struct rq_qos *rqos = data;
        struct rq_wb *rwb = RQWB(rqos);

        seq_printf(m, "%d\n", rwb->enable_state);
        return 0;
}

static int wbt_id_show(void *data, struct seq_file *m)
{
        struct rq_qos *rqos = data;

        seq_printf(m, "%u\n", rqos->id);
        return 0;
}

static int wbt_inflight_show(void *data, struct seq_file *m)
{
        struct rq_qos *rqos = data;
        struct rq_wb *rwb = RQWB(rqos);
        int i;

        for (i = 0; i < WBT_NUM_RWQ; i++)
                seq_printf(m, "%d: inflight %d\n", i,
                           atomic_read(&rwb->rq_wait[i].inflight));
        return 0;
}

static int wbt_min_lat_nsec_show(void *data, struct seq_file *m)
{
        struct rq_qos *rqos = data;
        struct rq_wb *rwb = RQWB(rqos);

        seq_printf(m, "%lu\n", rwb->min_lat_nsec);
        return 0;
}

static int wbt_unknown_cnt_show(void *data, struct seq_file *m)
{
        struct rq_qos *rqos = data;
        struct rq_wb *rwb = RQWB(rqos);

        seq_printf(m, "%u\n", rwb->unknown_cnt);
        return 0;
}

static int wbt_normal_show(void *data, struct seq_file *m)
{
        struct rq_qos *rqos = data;
        struct rq_wb *rwb = RQWB(rqos);

        seq_printf(m, "%u\n", rwb->wb_normal);
        return 0;
}

static int wbt_background_show(void *data, struct seq_file *m)
{
        struct rq_qos *rqos = data;
        struct rq_wb *rwb = RQWB(rqos);

        seq_printf(m, "%u\n", rwb->wb_background);
        return 0;
}

static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = {
        {"curr_win_nsec", 0400, wbt_curr_win_nsec_show},
        {"enabled", 0400, wbt_enabled_show},
        {"id", 0400, wbt_id_show},
        {"inflight", 0400, wbt_inflight_show},
        {"min_lat_nsec", 0400, wbt_min_lat_nsec_show},
        {"unknown_cnt", 0400, wbt_unknown_cnt_show},
        {"wb_normal", 0400, wbt_normal_show},
        {"wb_background", 0400, wbt_background_show},
        {},
};
#endif

static const struct rq_qos_ops wbt_rqos_ops = {
        .throttle = wbt_wait,
        .issue = wbt_issue,
        .track = wbt_track,
        .requeue = wbt_requeue,
        .done = wbt_done,
        .cleanup = wbt_cleanup,
        .queue_depth_changed = wbt_queue_depth_changed,
        .exit = wbt_exit,
#ifdef CONFIG_BLK_DEBUG_FS
        .debugfs_attrs = wbt_debugfs_attrs,
#endif
};

int wbt_init(struct gendisk *disk)
{
        struct request_queue *q = disk->queue;
        struct rq_wb *rwb;
        int i;
        int ret;

        rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
        if (!rwb)
                return -ENOMEM;

        rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
        if (!rwb->cb) {
                kfree(rwb);
                return -ENOMEM;
        }

        for (i = 0; i < WBT_NUM_RWQ; i++)
                rq_wait_init(&rwb->rq_wait[i]);

        rwb->last_comp = rwb->last_issue = jiffies;
        rwb->win_nsec = RWB_WINDOW_NSEC;
        rwb->enable_state = WBT_STATE_ON_DEFAULT;
        rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
        rwb->min_lat_nsec = wbt_default_latency_nsec(q);
        rwb->rq_depth.queue_depth = blk_queue_depth(q);
        wbt_update_limits(rwb);

        /*
         * Assign rwb and add the stats callback.
         */
        mutex_lock(&q->rq_qos_mutex);
        ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
        mutex_unlock(&q->rq_qos_mutex);
        if (ret)
                goto err_free;

        blk_stat_add_callback(q, rwb->cb);

        return 0;

err_free:
        blk_stat_free_callback(rwb->cb);
        kfree(rwb);
        return ret;
}