discard.c 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include <linux/jiffies.h>
  3. #include <linux/kernel.h>
  4. #include <linux/ktime.h>
  5. #include <linux/list.h>
  6. #include <linux/math64.h>
  7. #include <linux/sizes.h>
  8. #include <linux/workqueue.h>
  9. #include "ctree.h"
  10. #include "block-group.h"
  11. #include "discard.h"
  12. #include "free-space-cache.h"
  13. #include "fs.h"
  14. /*
  15. * This contains the logic to handle async discard.
  16. *
  17. * Async discard manages trimming of free space outside of transaction commit.
  18. * Discarding is done by managing the block_groups on a LRU list based on free
  19. * space recency. Two passes are used to first prioritize discarding extents
  20. * and then allow for trimming in the bitmap the best opportunity to coalesce.
  21. * The block_groups are maintained on multiple lists to allow for multiple
  22. * passes with different discard filter requirements. A delayed work item is
  23. * used to manage discarding with timeout determined by a max of the delay
  24. * incurred by the iops rate limit, the byte rate limit, and the max delay of
  25. * BTRFS_DISCARD_MAX_DELAY.
  26. *
  27. * Note, this only keeps track of block_groups that are explicitly for data.
  28. * Mixed block_groups are not supported.
  29. *
  30. * The first list is special to manage discarding of fully free block groups.
  31. * This is necessary because we issue a final trim for a full free block group
  32. * after forgetting it. When a block group becomes unused, instead of directly
  33. * being added to the unused_bgs list, we add it to this first list. Then
  34. * from there, if it becomes fully discarded, we place it onto the unused_bgs
  35. * list.
  36. *
  37. * The in-memory free space cache serves as the backing state for discard.
  38. * Consequently this means there is no persistence. We opt to load all the
  39. * block groups in as not discarded, so the mount case degenerates to the
  40. * crashing case.
  41. *
  42. * As the free space cache uses bitmaps, there exists a tradeoff between
  43. * ease/efficiency for find_free_extent() and the accuracy of discard state.
  44. * Here we opt to let untrimmed regions merge with everything while only letting
  45. * trimmed regions merge with other trimmed regions. This can cause
  46. * overtrimming, but the coalescing benefit seems to be worth it. Additionally,
  47. * bitmap state is tracked as a whole. If we're able to fully trim a bitmap,
  48. * the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in,
  49. * this resets the state and we will retry trimming the whole bitmap. This is a
  50. * tradeoff between discard state accuracy and the cost of accounting.
  51. */
  52. /* This is an initial delay to give some chance for block reuse */
  53. #define BTRFS_DISCARD_DELAY (120ULL * NSEC_PER_SEC)
  54. #define BTRFS_DISCARD_UNUSED_DELAY (10ULL * NSEC_PER_SEC)
  55. #define BTRFS_DISCARD_MIN_DELAY_MSEC (1UL)
  56. #define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL)
  57. #define BTRFS_DISCARD_MAX_IOPS (1000U)
  58. /* Monotonically decreasing minimum length filters after index 0 */
  59. static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
  60. 0,
  61. BTRFS_ASYNC_DISCARD_MAX_FILTER,
  62. BTRFS_ASYNC_DISCARD_MIN_FILTER
  63. };
  64. static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
  65. const struct btrfs_block_group *block_group)
  66. {
  67. return &discard_ctl->discard_list[block_group->discard_index];
  68. }
  69. /*
  70. * Determine if async discard should be running.
  71. *
  72. * @discard_ctl: discard control
  73. *
  74. * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
  75. */
  76. static bool btrfs_run_discard_work(const struct btrfs_discard_ctl *discard_ctl)
  77. {
  78. struct btrfs_fs_info *fs_info = container_of(discard_ctl,
  79. struct btrfs_fs_info,
  80. discard_ctl);
  81. return (!(fs_info->sb->s_flags & SB_RDONLY) &&
  82. test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
  83. }
  84. static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
  85. struct btrfs_block_group *block_group)
  86. {
  87. lockdep_assert_held(&discard_ctl->lock);
  88. if (list_empty(&block_group->discard_list) ||
  89. block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
  90. if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
  91. block_group->discard_index = BTRFS_DISCARD_INDEX_START;
  92. block_group->discard_eligible_time = (ktime_get_ns() +
  93. BTRFS_DISCARD_DELAY);
  94. block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
  95. }
  96. if (list_empty(&block_group->discard_list))
  97. btrfs_get_block_group(block_group);
  98. list_move_tail(&block_group->discard_list,
  99. get_discard_list(discard_ctl, block_group));
  100. }
  101. static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
  102. struct btrfs_block_group *block_group)
  103. {
  104. if (!btrfs_is_block_group_data_only(block_group))
  105. return;
  106. if (!btrfs_run_discard_work(discard_ctl))
  107. return;
  108. spin_lock(&discard_ctl->lock);
  109. __add_to_discard_list(discard_ctl, block_group);
  110. spin_unlock(&discard_ctl->lock);
  111. }
  112. static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
  113. struct btrfs_block_group *block_group)
  114. {
  115. bool queued;
  116. spin_lock(&discard_ctl->lock);
  117. queued = !list_empty(&block_group->discard_list);
  118. if (!btrfs_run_discard_work(discard_ctl)) {
  119. spin_unlock(&discard_ctl->lock);
  120. return;
  121. }
  122. list_del_init(&block_group->discard_list);
  123. block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
  124. block_group->discard_eligible_time = (ktime_get_ns() +
  125. BTRFS_DISCARD_UNUSED_DELAY);
  126. block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
  127. if (!queued)
  128. btrfs_get_block_group(block_group);
  129. list_add_tail(&block_group->discard_list,
  130. &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);
  131. spin_unlock(&discard_ctl->lock);
  132. }
  133. static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
  134. struct btrfs_block_group *block_group)
  135. {
  136. bool running = false;
  137. bool queued = false;
  138. spin_lock(&discard_ctl->lock);
  139. if (block_group == discard_ctl->block_group) {
  140. running = true;
  141. discard_ctl->block_group = NULL;
  142. }
  143. block_group->discard_eligible_time = 0;
  144. queued = !list_empty(&block_group->discard_list);
  145. list_del_init(&block_group->discard_list);
  146. if (queued)
  147. btrfs_put_block_group(block_group);
  148. spin_unlock(&discard_ctl->lock);
  149. return running;
  150. }
  151. /*
  152. * Find block_group that's up next for discarding.
  153. *
  154. * @discard_ctl: discard control
  155. * @now: current time
  156. *
  157. * Iterate over the discard lists to find the next block_group up for
  158. * discarding checking the discard_eligible_time of block_group.
  159. */
  160. static struct btrfs_block_group *find_next_block_group(
  161. struct btrfs_discard_ctl *discard_ctl,
  162. u64 now)
  163. {
  164. struct btrfs_block_group *ret_block_group = NULL, *block_group;
  165. int i;
  166. for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
  167. struct list_head *discard_list = &discard_ctl->discard_list[i];
  168. if (!list_empty(discard_list)) {
  169. block_group = list_first_entry(discard_list,
  170. struct btrfs_block_group,
  171. discard_list);
  172. if (!ret_block_group)
  173. ret_block_group = block_group;
  174. if (ret_block_group->discard_eligible_time < now)
  175. break;
  176. if (ret_block_group->discard_eligible_time >
  177. block_group->discard_eligible_time)
  178. ret_block_group = block_group;
  179. }
  180. }
  181. return ret_block_group;
  182. }
  183. /*
  184. * Look up next block group and set it for use.
  185. *
  186. * @discard_ctl: discard control
  187. * @discard_state: the discard_state of the block_group after state management
  188. * @discard_index: the discard_index of the block_group after state management
  189. * @now: time when discard was invoked, in ns
  190. *
  191. * Wrap find_next_block_group() and set the block_group to be in use.
  192. * @discard_state's control flow is managed here. Variables related to
  193. * @discard_state are reset here as needed (eg. @discard_cursor). @discard_state
  194. * and @discard_index are remembered as it may change while we're discarding,
  195. * but we want the discard to execute in the context determined here.
  196. */
  197. static struct btrfs_block_group *peek_discard_list(
  198. struct btrfs_discard_ctl *discard_ctl,
  199. enum btrfs_discard_state *discard_state,
  200. int *discard_index, u64 now)
  201. {
  202. struct btrfs_block_group *block_group;
  203. spin_lock(&discard_ctl->lock);
  204. again:
  205. block_group = find_next_block_group(discard_ctl, now);
  206. if (block_group && now >= block_group->discard_eligible_time) {
  207. if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
  208. block_group->used != 0) {
  209. if (btrfs_is_block_group_data_only(block_group)) {
  210. __add_to_discard_list(discard_ctl, block_group);
  211. /*
  212. * The block group must have been moved to other
  213. * discard list even if discard was disabled in
  214. * the meantime or a transaction abort happened,
  215. * otherwise we can end up in an infinite loop,
  216. * always jumping into the 'again' label and
  217. * keep getting this block group over and over
  218. * in case there are no other block groups in
  219. * the discard lists.
  220. */
  221. ASSERT(block_group->discard_index !=
  222. BTRFS_DISCARD_INDEX_UNUSED);
  223. } else {
  224. list_del_init(&block_group->discard_list);
  225. btrfs_put_block_group(block_group);
  226. }
  227. goto again;
  228. }
  229. if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
  230. block_group->discard_cursor = block_group->start;
  231. block_group->discard_state = BTRFS_DISCARD_EXTENTS;
  232. }
  233. }
  234. if (block_group) {
  235. btrfs_get_block_group(block_group);
  236. discard_ctl->block_group = block_group;
  237. *discard_state = block_group->discard_state;
  238. *discard_index = block_group->discard_index;
  239. }
  240. spin_unlock(&discard_ctl->lock);
  241. return block_group;
  242. }
  243. /*
  244. * Update a block group's filters.
  245. *
  246. * @block_group: block group of interest
  247. * @bytes: recently freed region size after coalescing
  248. *
  249. * Async discard maintains multiple lists with progressively smaller filters
  250. * to prioritize discarding based on size. Should a free space that matches
  251. * a larger filter be returned to the free_space_cache, prioritize that discard
  252. * by moving @block_group to the proper filter.
  253. */
  254. void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
  255. u64 bytes)
  256. {
  257. struct btrfs_discard_ctl *discard_ctl;
  258. if (!block_group ||
  259. !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
  260. return;
  261. discard_ctl = &block_group->fs_info->discard_ctl;
  262. if (block_group->discard_index > BTRFS_DISCARD_INDEX_START &&
  263. bytes >= discard_minlen[block_group->discard_index - 1]) {
  264. int i;
  265. remove_from_discard_list(discard_ctl, block_group);
  266. for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS;
  267. i++) {
  268. if (bytes >= discard_minlen[i]) {
  269. block_group->discard_index = i;
  270. add_to_discard_list(discard_ctl, block_group);
  271. break;
  272. }
  273. }
  274. }
  275. }
  276. /*
  277. * Move a block group along the discard lists.
  278. *
  279. * @discard_ctl: discard control
  280. * @block_group: block_group of interest
  281. *
  282. * Increment @block_group's discard_index. If it falls of the list, let it be.
  283. * Otherwise add it back to the appropriate list.
  284. */
  285. static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
  286. struct btrfs_block_group *block_group)
  287. {
  288. block_group->discard_index++;
  289. if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) {
  290. block_group->discard_index = 1;
  291. return;
  292. }
  293. add_to_discard_list(discard_ctl, block_group);
  294. }
  295. /*
  296. * Remove a block_group from the discard lists.
  297. *
  298. * @discard_ctl: discard control
  299. * @block_group: block_group of interest
  300. *
  301. * Remove @block_group from the discard lists. If necessary, wait on the
  302. * current work and then reschedule the delayed work.
  303. */
  304. void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
  305. struct btrfs_block_group *block_group)
  306. {
  307. if (remove_from_discard_list(discard_ctl, block_group)) {
  308. cancel_delayed_work_sync(&discard_ctl->work);
  309. btrfs_discard_schedule_work(discard_ctl, true);
  310. }
  311. }
  312. /*
  313. * Handles queuing the block_groups.
  314. *
  315. * @discard_ctl: discard control
  316. * @block_group: block_group of interest
  317. *
  318. * Maintain the LRU order of the discard lists.
  319. */
  320. void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
  321. struct btrfs_block_group *block_group)
  322. {
  323. if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
  324. return;
  325. if (block_group->used == 0)
  326. add_to_discard_unused_list(discard_ctl, block_group);
  327. else
  328. add_to_discard_list(discard_ctl, block_group);
  329. if (!delayed_work_pending(&discard_ctl->work))
  330. btrfs_discard_schedule_work(discard_ctl, false);
  331. }
  332. static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
  333. u64 now, bool override)
  334. {
  335. struct btrfs_block_group *block_group;
  336. if (!btrfs_run_discard_work(discard_ctl))
  337. return;
  338. if (!override && delayed_work_pending(&discard_ctl->work))
  339. return;
  340. block_group = find_next_block_group(discard_ctl, now);
  341. if (block_group) {
  342. u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC;
  343. u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);
  344. /*
  345. * A single delayed workqueue item is responsible for
  346. * discarding, so we can manage the bytes rate limit by keeping
  347. * track of the previous discard.
  348. */
  349. if (kbps_limit && discard_ctl->prev_discard) {
  350. u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
  351. u64 bps_delay = div64_u64(discard_ctl->prev_discard *
  352. NSEC_PER_SEC, bps_limit);
  353. delay = max(delay, bps_delay);
  354. }
  355. /*
  356. * This timeout is to hopefully prevent immediate discarding
  357. * in a recently allocated block group.
  358. */
  359. if (now < block_group->discard_eligible_time) {
  360. u64 bg_timeout = block_group->discard_eligible_time - now;
  361. delay = max(delay, bg_timeout);
  362. }
  363. if (override && discard_ctl->prev_discard) {
  364. u64 elapsed = now - discard_ctl->prev_discard_time;
  365. if (delay > elapsed)
  366. delay -= elapsed;
  367. else
  368. delay = 0;
  369. }
  370. mod_delayed_work(discard_ctl->discard_workers,
  371. &discard_ctl->work, nsecs_to_jiffies(delay));
  372. }
  373. }
  374. /*
  375. * Responsible for scheduling the discard work.
  376. *
  377. * @discard_ctl: discard control
  378. * @override: override the current timer
  379. *
  380. * Discards are issued by a delayed workqueue item. @override is used to
  381. * update the current delay as the baseline delay interval is reevaluated on
  382. * transaction commit. This is also maxed with any other rate limit.
  383. */
  384. void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
  385. bool override)
  386. {
  387. const u64 now = ktime_get_ns();
  388. spin_lock(&discard_ctl->lock);
  389. __btrfs_discard_schedule_work(discard_ctl, now, override);
  390. spin_unlock(&discard_ctl->lock);
  391. }
  392. /*
  393. * Determine next step of a block_group.
  394. *
  395. * @discard_ctl: discard control
  396. * @block_group: block_group of interest
  397. *
  398. * Determine the next step for a block group after it's finished going through
  399. * a pass on a discard list. If it is unused and fully trimmed, we can mark it
  400. * unused and send it to the unused_bgs path. Otherwise, pass it onto the
  401. * appropriate filter list or let it fall off.
  402. */
  403. static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
  404. struct btrfs_block_group *block_group)
  405. {
  406. remove_from_discard_list(discard_ctl, block_group);
  407. if (block_group->used == 0) {
  408. if (btrfs_is_free_space_trimmed(block_group))
  409. btrfs_mark_bg_unused(block_group);
  410. else
  411. add_to_discard_unused_list(discard_ctl, block_group);
  412. } else {
  413. btrfs_update_discard_index(discard_ctl, block_group);
  414. }
  415. }
  416. /*
  417. * Discard work queue callback
  418. *
  419. * @work: work
  420. *
  421. * Find the next block_group to start discarding and then discard a single
  422. * region. It does this in a two-pass fashion: first extents and second
  423. * bitmaps. Completely discarded block groups are sent to the unused_bgs path.
  424. */
  425. static void btrfs_discard_workfn(struct work_struct *work)
  426. {
  427. struct btrfs_discard_ctl *discard_ctl;
  428. struct btrfs_block_group *block_group;
  429. enum btrfs_discard_state discard_state;
  430. int discard_index = 0;
  431. u64 trimmed = 0;
  432. u64 minlen = 0;
  433. u64 now = ktime_get_ns();
  434. discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
  435. block_group = peek_discard_list(discard_ctl, &discard_state,
  436. &discard_index, now);
  437. if (!block_group)
  438. return;
  439. if (!btrfs_run_discard_work(discard_ctl)) {
  440. spin_lock(&discard_ctl->lock);
  441. btrfs_put_block_group(block_group);
  442. discard_ctl->block_group = NULL;
  443. spin_unlock(&discard_ctl->lock);
  444. return;
  445. }
  446. if (now < block_group->discard_eligible_time) {
  447. spin_lock(&discard_ctl->lock);
  448. btrfs_put_block_group(block_group);
  449. discard_ctl->block_group = NULL;
  450. spin_unlock(&discard_ctl->lock);
  451. btrfs_discard_schedule_work(discard_ctl, false);
  452. return;
  453. }
  454. /* Perform discarding */
  455. minlen = discard_minlen[discard_index];
  456. if (discard_state == BTRFS_DISCARD_BITMAPS) {
  457. u64 maxlen = 0;
  458. /*
  459. * Use the previous levels minimum discard length as the max
  460. * length filter. In the case something is added to make a
  461. * region go beyond the max filter, the entire bitmap is set
  462. * back to BTRFS_TRIM_STATE_UNTRIMMED.
  463. */
  464. if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
  465. maxlen = discard_minlen[discard_index - 1];
  466. btrfs_trim_block_group_bitmaps(block_group, &trimmed,
  467. block_group->discard_cursor,
  468. btrfs_block_group_end(block_group),
  469. minlen, maxlen, true);
  470. discard_ctl->discard_bitmap_bytes += trimmed;
  471. } else {
  472. btrfs_trim_block_group_extents(block_group, &trimmed,
  473. block_group->discard_cursor,
  474. btrfs_block_group_end(block_group),
  475. minlen, true);
  476. discard_ctl->discard_extent_bytes += trimmed;
  477. }
  478. /* Determine next steps for a block_group */
  479. if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
  480. if (discard_state == BTRFS_DISCARD_BITMAPS) {
  481. btrfs_finish_discard_pass(discard_ctl, block_group);
  482. } else {
  483. block_group->discard_cursor = block_group->start;
  484. spin_lock(&discard_ctl->lock);
  485. if (block_group->discard_state !=
  486. BTRFS_DISCARD_RESET_CURSOR)
  487. block_group->discard_state =
  488. BTRFS_DISCARD_BITMAPS;
  489. spin_unlock(&discard_ctl->lock);
  490. }
  491. }
  492. now = ktime_get_ns();
  493. spin_lock(&discard_ctl->lock);
  494. discard_ctl->prev_discard = trimmed;
  495. discard_ctl->prev_discard_time = now;
  496. btrfs_put_block_group(block_group);
  497. discard_ctl->block_group = NULL;
  498. __btrfs_discard_schedule_work(discard_ctl, now, false);
  499. spin_unlock(&discard_ctl->lock);
  500. }
  501. /*
  502. * Recalculate the base delay.
  503. *
  504. * @discard_ctl: discard control
  505. *
  506. * Recalculate the base delay which is based off the total number of
  507. * discardable_extents. Clamp this between the lower_limit (iops_limit or 1ms)
  508. * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
  509. */
  510. void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
  511. {
  512. s32 discardable_extents;
  513. s64 discardable_bytes;
  514. u32 iops_limit;
  515. unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC;
  516. unsigned long delay;
  517. discardable_extents = atomic_read(&discard_ctl->discardable_extents);
  518. if (!discardable_extents)
  519. return;
  520. spin_lock(&discard_ctl->lock);
  521. /*
  522. * The following is to fix a potential -1 discrepancy that we're not
  523. * sure how to reproduce. But given that this is the only place that
  524. * utilizes these numbers and this is only called by from
  525. * btrfs_finish_extent_commit() which is synchronized, we can correct
  526. * here.
  527. */
  528. if (discardable_extents < 0)
  529. atomic_add(-discardable_extents,
  530. &discard_ctl->discardable_extents);
  531. discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes);
  532. if (discardable_bytes < 0)
  533. atomic64_add(-discardable_bytes,
  534. &discard_ctl->discardable_bytes);
  535. if (discardable_extents <= 0) {
  536. spin_unlock(&discard_ctl->lock);
  537. return;
  538. }
  539. iops_limit = READ_ONCE(discard_ctl->iops_limit);
  540. if (iops_limit) {
  541. delay = MSEC_PER_SEC / iops_limit;
  542. } else {
  543. /*
  544. * Unset iops_limit means go as fast as possible, so allow a
  545. * delay of 0.
  546. */
  547. delay = 0;
  548. min_delay = 0;
  549. }
  550. delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC);
  551. discard_ctl->delay_ms = delay;
  552. spin_unlock(&discard_ctl->lock);
  553. }
  554. /*
  555. * Propagate discard counters.
  556. *
  557. * @block_group: block_group of interest
  558. *
  559. * Propagate deltas of counters up to the discard_ctl. It maintains a current
  560. * counter and a previous counter passing the delta up to the global stat.
  561. * Then the current counter value becomes the previous counter value.
  562. */
  563. void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
  564. {
  565. struct btrfs_free_space_ctl *ctl;
  566. struct btrfs_discard_ctl *discard_ctl;
  567. s32 extents_delta;
  568. s64 bytes_delta;
  569. if (!block_group ||
  570. !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) ||
  571. !btrfs_is_block_group_data_only(block_group))
  572. return;
  573. ctl = block_group->free_space_ctl;
  574. discard_ctl = &block_group->fs_info->discard_ctl;
  575. lockdep_assert_held(&ctl->tree_lock);
  576. extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
  577. ctl->discardable_extents[BTRFS_STAT_PREV];
  578. if (extents_delta) {
  579. atomic_add(extents_delta, &discard_ctl->discardable_extents);
  580. ctl->discardable_extents[BTRFS_STAT_PREV] =
  581. ctl->discardable_extents[BTRFS_STAT_CURR];
  582. }
  583. bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
  584. ctl->discardable_bytes[BTRFS_STAT_PREV];
  585. if (bytes_delta) {
  586. atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
  587. ctl->discardable_bytes[BTRFS_STAT_PREV] =
  588. ctl->discardable_bytes[BTRFS_STAT_CURR];
  589. }
  590. }
  591. /*
  592. * Punt unused_bgs list to discard lists.
  593. *
  594. * @fs_info: fs_info of interest
  595. *
  596. * The unused_bgs list needs to be punted to the discard lists because the
  597. * order of operations is changed. In the normal synchronous discard path, the
  598. * block groups are trimmed via a single large trim in transaction commit. This
  599. * is ultimately what we are trying to avoid with asynchronous discard. Thus,
  600. * it must be done before going down the unused_bgs path.
  601. */
  602. void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
  603. {
  604. struct btrfs_block_group *block_group, *next;
  605. spin_lock(&fs_info->unused_bgs_lock);
  606. /* We enabled async discard, so punt all to the queue */
  607. list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
  608. bg_list) {
  609. list_del_init(&block_group->bg_list);
  610. btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
  611. /*
  612. * This put is for the get done by btrfs_mark_bg_unused.
  613. * Queueing discard incremented it for discard's reference.
  614. */
  615. btrfs_put_block_group(block_group);
  616. }
  617. spin_unlock(&fs_info->unused_bgs_lock);
  618. }
  619. /*
  620. * Purge discard lists.
  621. *
  622. * @discard_ctl: discard control
  623. *
  624. * If we are disabling async discard, we may have intercepted block groups that
  625. * are completely free and ready for the unused_bgs path. As discarding will
  626. * now happen in transaction commit or not at all, we can safely mark the
  627. * corresponding block groups as unused and they will be sent on their merry
  628. * way to the unused_bgs list.
  629. */
  630. static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
  631. {
  632. struct btrfs_block_group *block_group, *next;
  633. int i;
  634. spin_lock(&discard_ctl->lock);
  635. for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
  636. list_for_each_entry_safe(block_group, next,
  637. &discard_ctl->discard_list[i],
  638. discard_list) {
  639. list_del_init(&block_group->discard_list);
  640. spin_unlock(&discard_ctl->lock);
  641. if (block_group->used == 0)
  642. btrfs_mark_bg_unused(block_group);
  643. spin_lock(&discard_ctl->lock);
  644. btrfs_put_block_group(block_group);
  645. }
  646. }
  647. spin_unlock(&discard_ctl->lock);
  648. }
  649. void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
  650. {
  651. if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
  652. btrfs_discard_cleanup(fs_info);
  653. return;
  654. }
  655. btrfs_discard_punt_unused_bgs_list(fs_info);
  656. set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
  657. }
  658. void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
  659. {
  660. clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
  661. }
  662. void btrfs_discard_init(struct btrfs_fs_info *fs_info)
  663. {
  664. struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
  665. int i;
  666. spin_lock_init(&discard_ctl->lock);
  667. INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn);
  668. for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++)
  669. INIT_LIST_HEAD(&discard_ctl->discard_list[i]);
  670. discard_ctl->prev_discard = 0;
  671. discard_ctl->prev_discard_time = 0;
  672. atomic_set(&discard_ctl->discardable_extents, 0);
  673. atomic64_set(&discard_ctl->discardable_bytes, 0);
  674. discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
  675. discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
  676. discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
  677. discard_ctl->kbps_limit = 0;
  678. discard_ctl->discard_extent_bytes = 0;
  679. discard_ctl->discard_bitmap_bytes = 0;
  680. atomic64_set(&discard_ctl->discard_bytes_saved, 0);
  681. }
  682. void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
  683. {
  684. btrfs_discard_stop(fs_info);
  685. cancel_delayed_work_sync(&fs_info->discard_ctl.work);
  686. btrfs_discard_purge_list(&fs_info->discard_ctl);
  687. }