xfs_discard.c 19 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (C) 2010, 2023 Red Hat, Inc.
  4. * All Rights Reserved.
  5. */
  6. #include "xfs.h"
  7. #include "xfs_shared.h"
  8. #include "xfs_format.h"
  9. #include "xfs_log_format.h"
  10. #include "xfs_trans_resv.h"
  11. #include "xfs_trans.h"
  12. #include "xfs_mount.h"
  13. #include "xfs_btree.h"
  14. #include "xfs_alloc_btree.h"
  15. #include "xfs_alloc.h"
  16. #include "xfs_discard.h"
  17. #include "xfs_error.h"
  18. #include "xfs_extent_busy.h"
  19. #include "xfs_trace.h"
  20. #include "xfs_log.h"
  21. #include "xfs_ag.h"
  22. #include "xfs_health.h"
  23. #include "xfs_rtbitmap.h"
/*
 * Notes on an efficient, low latency fstrim algorithm
 *
 * We need to walk the filesystem free space and issue discards on the free
 * space that meet the search criteria (size and location). We cannot issue
 * discards on extents that might be in use, or are so recently in use they are
 * still marked as busy. To serialise against extent state changes whilst we are
 * gathering extents to trim, we must hold the AGF lock to lock out other
 * allocations and extent free operations that might change extent state.
 *
 * However, we cannot just hold the AGF for the entire AG free space walk whilst
 * we issue discards on each free space that is found. Storage devices can have
 * extremely slow discard implementations (e.g. ceph RBD) and so walking a
 * couple of million free extents and issuing synchronous discards on each
 * extent can take a *long* time. Whilst we are doing this walk, nothing else
 * can access the AGF, and we can stall transactions and hence the log whilst
 * modifications wait for the AGF lock to be released. This can lead hung tasks
 * kicking the hung task timer and rebooting the system. This is bad.
 *
 * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
 * lock, gathers a range of inode cluster buffers that are allocated, drops the
 * AGI lock and then reads all the inode cluster buffers and processes them. It
 * loops doing this, using a cursor to keep track of where it is up to in the AG
 * for each iteration to restart the INOBT lookup from.
 *
 * We can't do this exactly with free space - once we drop the AGF lock, the
 * state of the free extent is out of our control and we cannot run a discard
 * safely on it in this situation. Unless, of course, we've marked the free
 * extent as busy and undergoing a discard operation whilst we held the AGF
 * locked.
 *
 * This is exactly how online discard works - free extents are marked busy when
 * they are freed, and once the extent free has been committed to the journal,
 * the busy extent record is marked as "undergoing discard" and the discard is
 * then issued on the free extent. Once the discard completes, the busy extent
 * record is removed and the extent is able to be allocated again.
 *
 * In the context of fstrim, if we find a free extent we need to discard, we
 * don't have to discard it immediately. All we need to do it record that free
 * extent as being busy and under discard, and all the allocation routines will
 * now avoid trying to allocate it. Hence if we mark the extent as busy under
 * the AGF lock, we can safely discard it without holding the AGF lock because
 * nothing will attempt to allocate that free space until the discard completes.
 *
 * This also allows us to issue discards asynchronously like we do with online
 * discard, and so for fast devices fstrim will run much faster as we can have
 * multiple discard operations in flight at once, as well as pipeline the free
 * extent search so that it overlaps in flight discard IO.
 */
struct workqueue_struct *xfs_discard_wq;

/*
 * Workqueue completion for a discard bio chain: clear all the busy extents
 * that the discard covered, then free the owner structure.
 *
 * NOTE(review): extents->owner is what gets freed here; for fstrim the
 * caller sets owner to the xfs_busy_extents allocation itself (see
 * xfs_trim_perag_extents()), so this frees @extents as well — the extent
 * list must not be touched after this runs.
 */
static void
xfs_discard_endio_work(
	struct work_struct	*work)
{
	struct xfs_busy_extents	*extents =
		container_of(work, struct xfs_busy_extents, endio_work);

	xfs_extent_busy_clear(extents->mount, &extents->extent_list, false);
	kfree(extents->owner);
}
/*
 * Queue up the actual completion to a thread to avoid IRQ-safe locking for
 * pagb_lock.
 *
 * Bio end_io handler: runs in completion (possibly IRQ) context, so defer
 * the busy extent cleanup to xfs_discard_wq and release the bio here.
 */
static void
xfs_discard_endio(
	struct bio		*bio)
{
	struct xfs_busy_extents	*extents = bio->bi_private;

	INIT_WORK(&extents->endio_work, xfs_discard_endio_work);
	queue_work(xfs_discard_wq, &extents->endio_work);
	bio_put(bio);
}
/*
 * Walk the discard list and issue discards on all the busy extents in the
 * list. We plug and chain the bios so that we only need a single completion
 * call to clear all the busy extents once the discards are complete.
 *
 * Returns 0 on success or a negative errno if a discard could not be issued
 * (-EOPNOTSUPP from the block layer is deliberately ignored).  On return the
 * chained bio has been submitted; the busy extents are cleared and @extents
 * freed by the endio work, so the caller must not reference @extents again.
 */
int
xfs_discard_extents(
	struct xfs_mount	*mp,
	struct xfs_busy_extents	*extents)
{
	struct xfs_extent_busy	*busyp;
	struct bio		*bio = NULL;
	struct blk_plug		plug;
	int			error = 0;

	blk_start_plug(&plug);
	list_for_each_entry(busyp, &extents->extent_list, list) {
		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
				busyp->length);

		/* chain each discard onto @bio so there is one completion */
		error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
				XFS_FSB_TO_BB(mp, busyp->length),
				GFP_KERNEL, &bio);
		if (error && error != -EOPNOTSUPP) {
			xfs_info(mp,
 "discard failed for extent [0x%llx,%u], error %d",
				(unsigned long long)busyp->bno,
				busyp->length,
				error);
			break;
		}
	}

	if (bio) {
		bio->bi_private = extents;
		bio->bi_end_io = xfs_discard_endio;
		submit_bio(bio);
	} else {
		/*
		 * No discard was issued; run the completion inline so the
		 * busy extents still get cleared and @extents freed.
		 */
		xfs_discard_endio_work(&extents->endio_work);
	}
	blk_finish_plug(&plug);

	return error;
}
/*
 * Care must be taken setting up the trim cursor as the perags may not have been
 * initialised when the cursor is initialised. e.g. a clean mount which hasn't
 * read in AGFs and the first operation run on the mounted fs is a trim. This
 * can result in perag fields that aren't initialised until
 * xfs_trim_gather_extents() calls xfs_alloc_read_agf() to lock down the AG for
 * the free space search.
 */
struct xfs_trim_cur {
	xfs_agblock_t	start;	/* first AG block of the trim range */
	xfs_extlen_t	count;	/* by-len restart key; 0 => walk finished */
	xfs_agblock_t	end;	/* last AG block of the trim range */
	xfs_extlen_t	minlen;	/* smallest extent worth discarding */
	bool		by_bno;	/* walk bnobt (sub-AG trim) instead of cntbt */
};
/*
 * Gather a batch of free space records from @pag that fall inside the trim
 * cursor's block range and meet its minimum length, and mark each one busy
 * and under discard on @extents->extent_list.
 *
 * The AGF is only held locked for the duration of this gather; the caller
 * issues the actual discards after we return and the lock is dropped.  @tcur
 * is advanced so the next call resumes where this batch stopped; tcur->count
 * being set to zero tells the caller the walk of this AG is complete.
 */
static int
xfs_trim_gather_extents(
	struct xfs_perag	*pag,
	struct xfs_trim_cur	*tcur,
	struct xfs_busy_extents	*extents)
{
	struct xfs_mount	*mp = pag->pag_mount;
	struct xfs_trans	*tp;
	struct xfs_btree_cur	*cur;
	struct xfs_buf		*agbp;
	int			error;
	int			i;
	int			batch = 100;	/* bounds AGF lock hold time */

	/*
	 * Force out the log. This means any transactions that might have freed
	 * space before we take the AGF buffer lock are now on disk, and the
	 * volatile disk cache is flushed.
	 */
	xfs_log_force(mp, XFS_LOG_SYNC);

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		return error;

	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
	if (error)
		goto out_trans_cancel;

	/*
	 * First time through tcur->count will not have been initialised as
	 * pag->pagf_longest is not guaranteed to be valid before we read
	 * the AGF buffer above.
	 */
	if (!tcur->count)
		tcur->count = pag->pagf_longest;

	if (tcur->by_bno) {
		/* sub-AG discard request always starts at tcur->start */
		cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
		error = xfs_alloc_lookup_le(cur, tcur->start, 0, &i);
		if (!error && !i)
			error = xfs_alloc_lookup_ge(cur, tcur->start, 0, &i);
	} else if (tcur->start == 0) {
		/* first time through a by-len starts with max length */
		cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
		error = xfs_alloc_lookup_ge(cur, 0, tcur->count, &i);
	} else {
		/* nth time through a by-len starts where we left off */
		cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
		error = xfs_alloc_lookup_le(cur, tcur->start, tcur->count, &i);
	}
	if (error)
		goto out_del_cursor;
	if (i == 0) {
		/* nothing of that length left in the AG, we are done */
		tcur->count = 0;
		goto out_del_cursor;
	}

	/*
	 * Loop until we are done with all extents that are large
	 * enough to be worth discarding or we hit batch limits.
	 */
	while (i) {
		xfs_agblock_t	fbno;
		xfs_extlen_t	flen;

		error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
		if (error)
			break;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			break;
		}

		if (--batch <= 0) {
			/*
			 * Update the cursor to point at this extent so we
			 * restart the next batch from this extent.
			 */
			tcur->start = fbno;
			tcur->count = flen;
			break;
		}

		/*
		 * If the extent is entirely outside of the range we are
		 * supposed to skip it.  Do not bother to trim down partially
		 * overlapping ranges for now.
		 */
		if (fbno + flen < tcur->start) {
			trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen);
			goto next_extent;
		}
		if (fbno > tcur->end) {
			trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen);
			/*
			 * A by-bno walk is in ascending block order, so once
			 * we are past the end of the range we are finished.
			 */
			if (tcur->by_bno) {
				tcur->count = 0;
				break;
			}
			goto next_extent;
		}

		/* Trim the extent returned to the range we want. */
		if (fbno < tcur->start) {
			flen -= tcur->start - fbno;
			fbno = tcur->start;
		}
		if (fbno + flen > tcur->end + 1)
			flen = tcur->end - fbno + 1;

		/* Too small?  Give up. */
		if (flen < tcur->minlen) {
			trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen);
			if (tcur->by_bno)
				goto next_extent;
			/*
			 * A by-len walk is in descending length order, so
			 * every remaining extent is smaller than this one.
			 */
			tcur->count = 0;
			break;
		}

		/*
		 * If any blocks in the range are still busy, skip the
		 * discard and try again the next time.
		 */
		if (xfs_extent_busy_search(mp, pag, fbno, flen)) {
			trace_xfs_discard_busy(mp, pag->pag_agno, fbno, flen);
			goto next_extent;
		}

		xfs_extent_busy_insert_discard(pag, fbno, flen,
				&extents->extent_list);
next_extent:
		if (tcur->by_bno)
			error = xfs_btree_increment(cur, 0, &i);
		else
			error = xfs_btree_decrement(cur, 0, &i);
		if (error)
			break;

		/*
		 * If there's no more records in the tree, we are done. Set the
		 * cursor block count to 0 to indicate to the caller that there
		 * is no more extents to search.
		 */
		if (i == 0)
			tcur->count = 0;
	}

	/*
	 * If there was an error, release all the gathered busy extents because
	 * we aren't going to issue a discard on them any more.
	 */
	if (error)
		xfs_extent_busy_clear(mp, &extents->extent_list, false);
out_del_cursor:
	xfs_btree_del_cursor(cur, error);
out_trans_cancel:
	xfs_trans_cancel(tp);
	return error;
}
  299. static bool
  300. xfs_trim_should_stop(void)
  301. {
  302. return fatal_signal_pending(current) || freezing(current);
  303. }
/*
 * Iterate the free list gathering extents and discarding them. We need a cursor
 * for the repeated iteration of gather/discard loop, so use the longest extent
 * we found in the last batch as the key to start the next.
 *
 * Returns 0 on success, or the first error hit by allocation, gathering or
 * discard submission.
 */
static int
xfs_trim_perag_extents(
	struct xfs_perag	*pag,
	xfs_agblock_t		start,
	xfs_agblock_t		end,
	xfs_extlen_t		minlen)
{
	struct xfs_trim_cur	tcur = {
		.start		= start,
		.end		= end,
		.minlen		= minlen,
	};
	int			error = 0;

	/*
	 * A full-AG request can walk the cntbt from longest extent down;
	 * anything smaller has to walk the bnobt by block number.
	 */
	if (start != 0 || end != pag->block_count)
		tcur.by_bno = true;

	do {
		struct xfs_busy_extents	*extents;

		extents = kzalloc(sizeof(*extents), GFP_KERNEL);
		if (!extents) {
			error = -ENOMEM;
			break;
		}

		extents->mount = pag->pag_mount;
		extents->owner = extents;	/* freed by the discard endio work */
		INIT_LIST_HEAD(&extents->extent_list);

		error = xfs_trim_gather_extents(pag, &tcur, extents);
		if (error) {
			kfree(extents);
			break;
		}

		/*
		 * We hand the extent list to the discard function here so the
		 * discarded extents can be removed from the busy extent list.
		 * This allows the discards to run asynchronously with gathering
		 * the next round of extents to discard.
		 *
		 * However, we must ensure that we do not reference the extent
		 * list after this function call, as it may have been freed by
		 * the time control returns to us.
		 */
		error = xfs_discard_extents(pag->pag_mount, extents);
		if (error)
			break;

		if (xfs_trim_should_stop())
			break;

	} while (tcur.count != 0);

	return error;
}
/*
 * Trim the data device daddr range [start, end] by walking each AG it spans.
 * Keeps going past per-AG errors and returns the last error seen, but stops
 * immediately on a fatal signal or freeze request.
 */
static int
xfs_trim_datadev_extents(
	struct xfs_mount	*mp,
	xfs_daddr_t		start,
	xfs_daddr_t		end,
	xfs_extlen_t		minlen)
{
	xfs_agnumber_t		start_agno, end_agno;
	xfs_agblock_t		start_agbno, end_agbno;
	xfs_daddr_t		ddev_end;
	struct xfs_perag	*pag;
	int			last_error = 0, error;

	/* clamp the range to the end of the data device */
	ddev_end = min_t(xfs_daddr_t, end,
			 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1);

	start_agno = xfs_daddr_to_agno(mp, start);
	start_agbno = xfs_daddr_to_agbno(mp, start);
	end_agno = xfs_daddr_to_agno(mp, ddev_end);
	end_agbno = xfs_daddr_to_agbno(mp, ddev_end);

	for_each_perag_range(mp, start_agno, end_agno, pag) {
		xfs_agblock_t	agend = pag->block_count;

		/*
		 * NOTE(review): for_each_perag_range() advances start_agno as
		 * the walk progresses, so this matches on the final AG of the
		 * walk and clamps it to the requested end block — confirm
		 * against the iterator macro definition.
		 */
		if (start_agno == end_agno)
			agend = end_agbno;
		error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen);
		if (error)
			last_error = error;

		if (xfs_trim_should_stop()) {
			xfs_perag_rele(pag);
			break;
		}
		/* every AG after the first is trimmed from block 0 */
		start_agbno = 0;
	}

	return last_error;
}
#ifdef CONFIG_XFS_RT
/* State for one batched walk of the realtime bitmap free space. */
struct xfs_trim_rtdev {
	/* list of rt extents to free */
	struct list_head	extent_list;

	/* minimum length that caller allows us to trim */
	xfs_rtblock_t		minlen_fsb;

	/* restart point for the rtbitmap walk */
	xfs_rtxnum_t		restart_rtx;

	/* stopping point for the current rtbitmap walk */
	xfs_rtxnum_t		stop_rtx;
};

/* A single free realtime extent queued for discard. */
struct xfs_rtx_busy {
	struct list_head	list;
	xfs_rtblock_t		bno;
	xfs_rtblock_t		length;
};
  406. static void
  407. xfs_discard_free_rtdev_extents(
  408. struct xfs_trim_rtdev *tr)
  409. {
  410. struct xfs_rtx_busy *busyp, *n;
  411. list_for_each_entry_safe(busyp, n, &tr->extent_list, list) {
  412. list_del_init(&busyp->list);
  413. kfree(busyp);
  414. }
  415. }
/*
 * Walk the discard list and issue discards on all the busy extents in the
 * list. We plug and chain the bios so that we only need a single completion
 * call to clear all the busy extents once the discards are complete.
 *
 * Unlike the data device path this waits synchronously for the chained bio
 * (submit_bio_wait) and frees the queued extent records before waiting.
 * -EOPNOTSUPP from the device is ignored.
 */
static int
xfs_discard_rtdev_extents(
	struct xfs_mount	*mp,
	struct xfs_trim_rtdev	*tr)
{
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	struct xfs_rtx_busy	*busyp;
	struct bio		*bio = NULL;
	struct blk_plug		plug;
	xfs_rtblock_t		start = NULLRTBLOCK, length = 0;
	int			error = 0;

	blk_start_plug(&plug);
	list_for_each_entry(busyp, &tr->extent_list, list) {
		/* first bno and summed length, for error reporting only */
		if (start == NULLRTBLOCK)
			start = busyp->bno;
		length += busyp->length;

		trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length);

		error = __blkdev_issue_discard(bdev,
				XFS_FSB_TO_BB(mp, busyp->bno),
				XFS_FSB_TO_BB(mp, busyp->length),
				GFP_NOFS, &bio);
		if (error)
			break;
	}
	xfs_discard_free_rtdev_extents(tr);

	if (bio) {
		error = submit_bio_wait(bio);
		if (error == -EOPNOTSUPP)
			error = 0;
		if (error)
			xfs_info(mp,
"discard failed for rtextent [0x%llx,%llu], error %d",
				(unsigned long long)start,
				(unsigned long long)length,
				error);
		bio_put(bio);
	}
	blk_finish_plug(&plug);

	return error;
}
/*
 * xfs_rtalloc_query_range() callback: queue one free rt extent for discard.
 *
 * Returns 0 on success (extent queued, skipped as too small, or recorded as
 * the next restart point), -ECANCELED to end the current batch once the walk
 * passes tr->stop_rtx, or -ENOMEM on allocation failure.
 */
static int
xfs_trim_gather_rtextent(
	struct xfs_mount		*mp,
	struct xfs_trans		*tp,
	const struct xfs_rtalloc_rec	*rec,
	void				*priv)
{
	struct xfs_trim_rtdev		*tr = priv;
	struct xfs_rtx_busy		*busyp;
	xfs_rtblock_t			rbno, rlen;

	if (rec->ar_startext > tr->stop_rtx) {
		/*
		 * If we've scanned a large number of rtbitmap blocks, update
		 * the cursor to point at this extent so we restart the next
		 * batch from this extent.
		 */
		tr->restart_rtx = rec->ar_startext;
		return -ECANCELED;
	}

	rbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
	rlen = xfs_rtx_to_rtb(mp, rec->ar_extcount);

	/* Ignore too small. */
	if (rlen < tr->minlen_fsb) {
		trace_xfs_discard_rttoosmall(mp, rbno, rlen);
		return 0;
	}

	busyp = kzalloc(sizeof(struct xfs_rtx_busy), GFP_KERNEL);
	if (!busyp)
		return -ENOMEM;

	busyp->bno = rbno;
	busyp->length = rlen;
	INIT_LIST_HEAD(&busyp->list);
	list_add_tail(&busyp->list, &tr->extent_list);

	/* resume the next batch just past this extent */
	tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
	return 0;
}
/*
 * Trim the portion of the FITRIM daddr range [start, end] that maps to the
 * realtime device.  The range is shifted down past the data device, then the
 * rtbitmap free space is walked in batches, holding the rtbitmap lock shared
 * only while gathering and discarding each batch.
 */
static int
xfs_trim_rtdev_extents(
	struct xfs_mount	*mp,
	xfs_daddr_t		start,
	xfs_daddr_t		end,
	xfs_daddr_t		minlen)
{
	struct xfs_trim_rtdev	tr = {
		.minlen_fsb	= XFS_BB_TO_FSB(mp, minlen),
	};
	xfs_rtxnum_t		low, high;
	struct xfs_trans	*tp;
	xfs_daddr_t		rtdev_daddr;
	int			error;

	INIT_LIST_HEAD(&tr.extent_list);

	/* Shift the start and end downwards to match the rt device. */
	rtdev_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
	if (start > rtdev_daddr)
		start -= rtdev_daddr;
	else
		start = 0;

	/* request ends before the rt device starts: nothing to do */
	if (end <= rtdev_daddr)
		return 0;
	end -= rtdev_daddr;

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		return error;

	end = min_t(xfs_daddr_t, end,
			XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks) - 1);

	/* Convert the rt blocks to rt extents */
	low = xfs_rtb_to_rtxup(mp, XFS_BB_TO_FSB(mp, start));
	high = xfs_rtb_to_rtx(mp, XFS_BB_TO_FSBT(mp, end));

	/*
	 * Walk the free ranges between low and high.  The query_range function
	 * trims the extents returned.
	 */
	do {
		/* batch size: one rtbitmap block's worth of rt extents */
		tr.stop_rtx = low + (mp->m_sb.sb_blocksize * NBBY);
		xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP);
		error = xfs_rtalloc_query_range(mp, tp, low, high,
				xfs_trim_gather_rtextent, &tr);

		/* -ECANCELED just means the batch filled up; keep going */
		if (error == -ECANCELED)
			error = 0;
		if (error) {
			xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
			xfs_discard_free_rtdev_extents(&tr);
			break;
		}

		if (list_empty(&tr.extent_list)) {
			xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
			break;
		}

		error = xfs_discard_rtdev_extents(mp, &tr);
		xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
		if (error)
			break;

		low = tr.restart_rtx;
	} while (!xfs_trim_should_stop() && low <= high);

	xfs_trans_cancel(tp);
	return error;
}
#else
# define xfs_trim_rtdev_extents(...)	(-EOPNOTSUPP)
#endif /* CONFIG_XFS_RT */
/*
 * trim a range of the filesystem.
 *
 * Note: the parameters passed from userspace are byte ranges into the
 * filesystem which does not match to the format we use for filesystem block
 * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
 * is a linear address range. Hence we need to use DADDR based conversions and
 * comparisons for determining the correct offset and regions to trim.
 *
 * The realtime device is mapped into the FITRIM "address space" immediately
 * after the data device.
 *
 * Returns 0 on success (and copies the examined byte count back to
 * userspace), or a negative errno: -EPERM, -EOPNOTSUPP, -EROFS, -EFAULT,
 * -EINVAL, or the last error from the per-device trim walks.
 */
int
xfs_ioc_trim(
	struct xfs_mount		*mp,
	struct fstrim_range __user	*urange)
{
	unsigned int		granularity =
		bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
	struct block_device	*rt_bdev = NULL;
	struct fstrim_range	range;
	xfs_daddr_t		start, end;
	xfs_extlen_t		minlen;
	xfs_rfsblock_t		max_blocks;
	int			error, last_error = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (mp->m_rtdev_targp &&
	    bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
		rt_bdev = mp->m_rtdev_targp->bt_bdev;
	if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
		return -EOPNOTSUPP;

	/* use the larger of the two devices' discard granules */
	if (rt_bdev)
		granularity = max(granularity,
				  bdev_discard_granularity(rt_bdev));

	/*
	 * We haven't recovered the log, so we cannot use our bnobt-guided
	 * storage zapping commands.
	 */
	if (xfs_has_norecovery(mp))
		return -EROFS;

	if (copy_from_user(&range, urange, sizeof(range)))
		return -EFAULT;

	range.minlen = max_t(u64, granularity, range.minlen);
	minlen = XFS_B_TO_FSB(mp, range.minlen);

	/*
	 * Truncating down the len isn't actually quite correct, but using
	 * BBTOB would mean we trivially get overflows for values
	 * of ULLONG_MAX or slightly lower.  And ULLONG_MAX is the default
	 * used by the fstrim application.  In the end it really doesn't
	 * matter as trimming blocks is an advisory interface.
	 */
	max_blocks = mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks;
	if (range.start >= XFS_FSB_TO_B(mp, max_blocks) ||
	    range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) ||
	    range.len < mp->m_sb.sb_blocksize)
		return -EINVAL;

	start = BTOBB(range.start);
	end = start + BTOBBT(range.len) - 1;

	if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) {
		error = xfs_trim_datadev_extents(mp, start, end, minlen);
		if (error)
			last_error = error;
	}

	if (rt_bdev && !xfs_trim_should_stop()) {
		error = xfs_trim_rtdev_extents(mp, start, end, minlen);
		if (error)
			last_error = error;
	}

	if (last_error)
		return last_error;

	/* report back the number of bytes the walk actually covered */
	range.len = min_t(unsigned long long, range.len,
			  XFS_FSB_TO_B(mp, max_blocks) - range.start);
	if (copy_to_user(urange, &range, sizeof(range)))
		return -EFAULT;
	return 0;
}