// SPDX-License-Identifier: GPL-2.0
/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/list_sort.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-wbt.h"

/*
 * Mark a hardware queue as needing a restart.
 */
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
		return;

	set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);

void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
	clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);

	/*
	 * Order clearing SCHED_RESTART against the list_empty_careful()
	 * check of hctx->dispatch in blk_mq_run_hw_queue(). Its pair is
	 * the barrier in blk_mq_dispatch_rq_list(). Without it, the
	 * dispatch code might not see SCHED_RESTART while, at the same
	 * time, a new request added to hctx->dispatch is missed by the
	 * check in blk_mq_run_hw_queue().
	 */
	smp_mb();

	blk_mq_run_hw_queue(hctx, true);
}
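
/*
 * list_sort() comparator: order requests by their hw queue pointer so that
 * requests destined for the same hctx end up adjacent in the list.
 */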
static int sched_rq_cmp(void *priv, const struct list_head *a,
			const struct list_head *b)
{
	struct request *rqa = container_of(a, struct request, queuelist);
	struct request *rqb = container_of(b, struct request, queuelist);

	return rqa->mq_hctx > rqb->mq_hctx;
}
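
/*
 * Dispatch the leading run of requests that share the first request's hctx;
 * requests for a different hctx are cut off and left on @rq_list for a
 * later call.
 */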
static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
{
	struct blk_mq_hw_ctx *hctx =
		list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
	struct request *rq;
	LIST_HEAD(hctx_list);
	unsigned int count = 0;

	list_for_each_entry(rq, rq_list, queuelist) {
		if (rq->mq_hctx != hctx) {
			list_cut_before(&hctx_list, rq_list, &rq->queuelist);
			goto dispatch;
		}
		count++;
	}
	list_splice_tail_init(rq_list, &hctx_list);

dispatch:
	return blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
}

#define BLK_MQ_BUDGET_DELAY	3		/* ms units */

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart queue if .get_budget() fails to get the budget.
 *
 * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
 * be run again. This is necessary to avoid starving flushes.
 */
static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	bool multi_hctxs = false, run_queue = false;
	bool dispatched = false, busy = false;
	unsigned int max_dispatch;
	LIST_HEAD(rq_list);
	int count = 0;

	if (hctx->dispatch_busy)
		max_dispatch = 1;
	else
		max_dispatch = hctx->queue->nr_requests;

	do {
		struct request *rq;
		int budget_token;

		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
			break;

		if (!list_empty_careful(&hctx->dispatch)) {
			busy = true;
			break;
		}

		budget_token = blk_mq_get_dispatch_budget(q);
		if (budget_token < 0)
			break;

		rq = e->type->ops.dispatch_request(hctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(q, budget_token);
			/*
			 * We're releasing without dispatching. Holding the
			 * budget could have blocked any "hctx"s with the
			 * same queue and if we didn't dispatch then there's
			 * no guarantee anyone will kick the queue. Kick it
			 * ourselves.
			 */
			run_queue = true;
			break;
		}

		blk_mq_set_rq_budget_token(rq, budget_token);

		/*
		 * Now this rq owns the budget which has to be released
		 * if this rq won't be queued to driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
		 */
		list_add_tail(&rq->queuelist, &rq_list);
		count++;
		if (rq->mq_hctx != hctx)
			multi_hctxs = true;

		/*
		 * If we cannot get a tag for the request, stop dequeueing
		 * requests from the IO scheduler. We are unlikely to be able
		 * to submit them anyway and it creates false impression for
		 * scheduling heuristics that the device can take more IO.
		 */
		if (!blk_mq_get_driver_tag(rq))
			break;
	} while (count < max_dispatch);

	if (!count) {
		if (run_queue)
			blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
	} else if (multi_hctxs) {
		/*
		 * Requests from different hctx may be dequeued from some
		 * schedulers, such as bfq and deadline.
		 *
		 * Sort the requests in the list according to their hctx and
		 * dispatch them in per-hctx batches.
		 */
		list_sort(NULL, &rq_list, sched_rq_cmp);
		do {
			dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
		} while (!list_empty(&rq_list));
	} else {
		dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count);
	}

	if (busy)
		return -EAGAIN;
	return !!dispatched;
}
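
/*
 * Keep pulling from the scheduler while progress is being made, but give up
 * the CPU after roughly one second (or when rescheduling is needed) and let
 * an async queue run pick the work back up.
 */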
static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
	unsigned long end = jiffies + HZ;
	int ret;

	do {
		ret = __blk_mq_do_dispatch_sched(hctx);
		if (ret != 1)
			break;
		if (need_resched() || time_is_before_jiffies(end)) {
			blk_mq_delay_run_hw_queue(hctx, 0);
			break;
		}
	} while (1);

	return ret;
}
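
/*
 * Return the software queue that follows @ctx on this hw queue, wrapping
 * around at the end; used for round-robin dispatch from the sw queues.
 */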
static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
					  struct blk_mq_ctx *ctx)
{
	unsigned short idx = ctx->index_hw[hctx->type];

	if (++idx == hctx->nr_ctx)
		idx = 0;

	return hctx->ctxs[idx];
}

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart queue if .get_budget() fails to get the budget.
 *
 * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
 * be run again. This is necessary to avoid starving flushes.
 */
static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	LIST_HEAD(rq_list);
	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
	int ret = 0;
	struct request *rq;

	do {
		int budget_token;

		if (!list_empty_careful(&hctx->dispatch)) {
			ret = -EAGAIN;
			break;
		}

		if (!sbitmap_any_bit_set(&hctx->ctx_map))
			break;

		budget_token = blk_mq_get_dispatch_budget(q);
		if (budget_token < 0)
			break;

		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(q, budget_token);
			/*
			 * We're releasing without dispatching. Holding the
			 * budget could have blocked any "hctx"s with the
			 * same queue and if we didn't dispatch then there's
			 * no guarantee anyone will kick the queue. Kick it
			 * ourselves.
			 */
			blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
			break;
		}

		blk_mq_set_rq_budget_token(rq, budget_token);

		/*
		 * Now this rq owns the budget which has to be released
		 * if this rq won't be queued to driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
		 */
		list_add(&rq->queuelist, &rq_list);

		/* round robin for fair dispatch */
		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);

	} while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1));

	WRITE_ONCE(hctx->dispatch_from, ctx);
	return ret;
}
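
/*
 * Dispatch pending work for one hw queue: requests parked on hctx->dispatch
 * first, then either the elevator, one sw queue at a time (when the device
 * is busy), or all sw queues at once.
 */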
static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	bool need_dispatch = false;
	LIST_HEAD(rq_list);

	/*
	 * If we have previous entries on our dispatch list, grab them first
	 * for fairer dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Only ask the scheduler for requests if we didn't have residual
	 * requests from the dispatch list. This is to avoid the case where
	 * we only ever dispatch a fraction of the requests available because
	 * of low device queue depth. Once we pull requests out of the IO
	 * scheduler, we can no longer merge or sort them. So it's best to
	 * leave them there for as long as we can. Mark the hw queue as
	 * needing a restart in that case.
	 *
	 * We want to dispatch from the scheduler if there was nothing
	 * on the dispatch list or we were able to dispatch from the
	 * dispatch list.
	 */
	if (!list_empty(&rq_list)) {
		blk_mq_sched_mark_restart_hctx(hctx);
		if (!blk_mq_dispatch_rq_list(hctx, &rq_list, 0))
			return 0;
		need_dispatch = true;
	} else {
		need_dispatch = hctx->dispatch_busy;
	}

	if (hctx->queue->elevator)
		return blk_mq_do_dispatch_sched(hctx);

	/* dequeue request one by one from sw queue if queue is busy */
	if (need_dispatch)
		return blk_mq_do_dispatch_ctx(hctx);
	blk_mq_flush_busy_ctxs(hctx, &rq_list);
	blk_mq_dispatch_rq_list(hctx, &rq_list, 0);
	return 0;
}

void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;

	/* RCU or SRCU read lock is needed before checking quiesced flag */
	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
		return;

	/*
	 * A return of -EAGAIN is an indication that hctx->dispatch is not
	 * empty and we must run again in order to avoid starving flushes.
	 */
	if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) {
		if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN)
			blk_mq_run_hw_queue(hctx, true);
	}
}
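
/*
 * Try to merge @bio into a pending request: let the elevator's ->bio_merge()
 * handle it if one is present, otherwise fall back to scanning this CPU's
 * software queue.
 */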
bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
		unsigned int nr_segs)
{
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx;
	struct blk_mq_hw_ctx *hctx;
	bool ret = false;
	enum hctx_type type;

	if (e && e->type->ops.bio_merge) {
		ret = e->type->ops.bio_merge(q, bio, nr_segs);
		goto out_put;
	}

	ctx = blk_mq_get_ctx(q);
	hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
	type = hctx->type;
	if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
	    list_empty_careful(&ctx->rq_lists[type]))
		goto out_put;

	/* default per sw-queue merge */
	spin_lock(&ctx->lock);
	/*
	 * Reverse check our software queue for entries that we could
	 * potentially merge with. Currently includes a hand-wavy stop
	 * count of 8, to not spend too much time checking for merges.
	 */
	if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs))
		ret = true;

	spin_unlock(&ctx->lock);
out_put:
	return ret;
}
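
/*
 * Attempt to merge @rq with a request that is already queued instead of
 * inserting it; any requests that become redundant are collected on @free
 * for the caller to release.
 */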
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
				   struct list_head *free)
{
	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
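
/*
 * Set up the scheduler tags for one hw queue: reuse the queue-wide
 * sched_shared_tags when the tag set uses shared tags, otherwise allocate a
 * per-hctx map sized to q->nr_requests.
 */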
static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
					  struct blk_mq_hw_ctx *hctx,
					  unsigned int hctx_idx)
{
	if (blk_mq_is_shared_tags(q->tag_set->flags)) {
		hctx->sched_tags = q->sched_shared_tags;
		return 0;
	}

	hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
						    q->nr_requests);
	if (!hctx->sched_tags)
		return -ENOMEM;

	return 0;
}

static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
{
	blk_mq_free_rq_map(queue->sched_shared_tags);
	queue->sched_shared_tags = NULL;
}

/* called in queue's release handler, tagset has gone away */
static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (hctx->sched_tags) {
			if (!blk_mq_is_shared_tags(flags))
				blk_mq_free_rq_map(hctx->sched_tags);
			hctx->sched_tags = NULL;
		}
	}

	if (blk_mq_is_shared_tags(flags))
		blk_mq_exit_sched_shared_tags(q);
}

static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
{
	struct blk_mq_tag_set *set = queue->tag_set;

	/*
	 * Set initial depth at max so that we don't need to reallocate for
	 * updating nr_requests.
	 */
	queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
						BLK_MQ_NO_HCTX_IDX,
						MAX_SCHED_RQ);
	if (!queue->sched_shared_tags)
		return -ENOMEM;

	blk_mq_tag_update_sched_shared_tags(queue);

	return 0;
}

/* caller must have a reference to @e, will grab another one if successful */
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
	unsigned int flags = q->tag_set->flags;
	struct blk_mq_hw_ctx *hctx;
	struct elevator_queue *eq;
	unsigned long i;
	int ret;

	/*
	 * Default to twice the smaller of the hw queue_depth and 128,
	 * since we don't split into sync/async like the old code did.
	 * Additionally, this is a per-hw queue depth.
	 */
	q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
				   BLKDEV_DEFAULT_RQ);

	if (blk_mq_is_shared_tags(flags)) {
		ret = blk_mq_init_sched_shared_tags(q);
		if (ret)
			return ret;
	}

	queue_for_each_hw_ctx(q, hctx, i) {
		ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
		if (ret)
			goto err_free_map_and_rqs;
	}

	ret = e->ops.init_sched(q, e);
	if (ret)
		goto err_free_map_and_rqs;

	mutex_lock(&q->debugfs_mutex);
	blk_mq_debugfs_register_sched(q);
	mutex_unlock(&q->debugfs_mutex);

	queue_for_each_hw_ctx(q, hctx, i) {
		if (e->ops.init_hctx) {
			ret = e->ops.init_hctx(hctx, i);
			if (ret) {
				eq = q->elevator;
				blk_mq_sched_free_rqs(q);
				blk_mq_exit_sched(q, eq);
				kobject_put(&eq->kobj);
				return ret;
			}
		}
		mutex_lock(&q->debugfs_mutex);
		blk_mq_debugfs_register_sched_hctx(q, hctx);
		mutex_unlock(&q->debugfs_mutex);
	}

	return 0;

err_free_map_and_rqs:
	blk_mq_sched_free_rqs(q);
	blk_mq_sched_tags_teardown(q, flags);

	q->elevator = NULL;
	return ret;
}

/*
 * called in either blk_queue_cleanup or elevator_switch, tagset
 * is required for freeing requests
 */
void blk_mq_sched_free_rqs(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;

	if (blk_mq_is_shared_tags(q->tag_set->flags)) {
		blk_mq_free_rqs(q->tag_set, q->sched_shared_tags,
				BLK_MQ_NO_HCTX_IDX);
	} else {
		queue_for_each_hw_ctx(q, hctx, i) {
			if (hctx->sched_tags)
				blk_mq_free_rqs(q->tag_set,
						hctx->sched_tags, i);
		}
	}
}
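
/*
 * Tear down the elevator: per-hctx scheduler data and debugfs entries first,
 * then the elevator itself, and finally the scheduler tag maps.
 */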
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;
	unsigned int flags = 0;

	queue_for_each_hw_ctx(q, hctx, i) {
		mutex_lock(&q->debugfs_mutex);
		blk_mq_debugfs_unregister_sched_hctx(hctx);
		mutex_unlock(&q->debugfs_mutex);

		if (e->type->ops.exit_hctx && hctx->sched_data) {
			e->type->ops.exit_hctx(hctx, i);
			hctx->sched_data = NULL;
		}
		flags = hctx->flags;
	}

	mutex_lock(&q->debugfs_mutex);
	blk_mq_debugfs_unregister_sched(q);
	mutex_unlock(&q->debugfs_mutex);

	if (e->type->ops.exit_sched)
		e->type->ops.exit_sched(e);
	blk_mq_sched_tags_teardown(q, flags);
	q->elevator = NULL;
}