cow_repair.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614
  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
  4. * Author: Darrick J. Wong <djwong@kernel.org>
  5. */
  6. #include "xfs.h"
  7. #include "xfs_fs.h"
  8. #include "xfs_shared.h"
  9. #include "xfs_format.h"
  10. #include "xfs_trans_resv.h"
  11. #include "xfs_mount.h"
  12. #include "xfs_defer.h"
  13. #include "xfs_btree.h"
  14. #include "xfs_log_format.h"
  15. #include "xfs_trans.h"
  16. #include "xfs_inode.h"
  17. #include "xfs_inode_fork.h"
  18. #include "xfs_alloc.h"
  19. #include "xfs_bmap.h"
  20. #include "xfs_rmap.h"
  21. #include "xfs_refcount.h"
  22. #include "xfs_quota.h"
  23. #include "xfs_ialloc.h"
  24. #include "xfs_ag.h"
  25. #include "xfs_error.h"
  26. #include "xfs_errortag.h"
  27. #include "xfs_icache.h"
  28. #include "xfs_refcount_btree.h"
  29. #include "scrub/xfs_scrub.h"
  30. #include "scrub/scrub.h"
  31. #include "scrub/common.h"
  32. #include "scrub/trace.h"
  33. #include "scrub/repair.h"
  34. #include "scrub/bitmap.h"
  35. #include "scrub/off_bitmap.h"
  36. #include "scrub/fsb_bitmap.h"
  37. #include "scrub/reap.h"
  38. /*
  39. * CoW Fork Mapping Repair
  40. * =======================
  41. *
  42. * Although CoW staging extents are owned by incore CoW inode forks, on disk
  43. * they are owned by the refcount btree. The ondisk metadata does not record
  44. * any ownership information, which limits what we can do to repair the
  45. * mappings in the CoW fork. At most, we can replace ifork mappings that lack
  46. * an entry in the refcount btree or are described by a reverse mapping record
  47. * whose owner is not OWN_COW.
  48. *
  49. * Replacing extents is also tricky -- we can't touch written CoW fork extents
  50. * since they are undergoing writeback, and delalloc extents do not require
  51. * repair since they only exist incore. Hence the most we can do is find the
  52. * bad parts of unwritten mappings, allocate a replacement set of blocks, and
  53. * replace the incore mapping. We use the regular reaping process to unmap
  54. * or free the discarded blocks, as appropriate.
  55. */
/* Working state for repairing the CoW fork of a single inode. */
struct xrep_cow {
	/* Scrub context that this repair is running under. */
	struct xfs_scrub	*sc;

	/* Bitmap of file offset ranges that need replacing. */
	struct xoff_bitmap	bad_fileoffs;

	/* Bitmap of fsblocks that were removed from the CoW fork. */
	struct xfsb_bitmap	old_cowfork_fsblocks;

	/* CoW fork mappings used to scan for bad CoW staging extents. */
	struct xfs_bmbt_irec	irec;

	/*
	 * AG block number of irec.br_startblock.  This is the coordinate
	 * used for the refcount and rmap btree range queries.
	 */
	unsigned int		irec_startbno;

	/*
	 * AG block number where the next CoW staging refcount record is
	 * expected to start; anything between this and the next record that
	 * we actually find is a gap in the staging records.
	 */
	unsigned int		next_bno;
};
/* CoW staging extent: one contiguous run of replacement blocks. */
struct xrep_cow_extent {
	/* First filesystem block of the extent. */
	xfs_fsblock_t		fsbno;

	/* Length of the extent, in blocks. */
	xfs_extlen_t		len;
};
  74. /*
  75. * Mark the part of the file range that corresponds to the given physical
  76. * space. Caller must ensure that the physical range is within xc->irec.
  77. */
  78. STATIC int
  79. xrep_cow_mark_file_range(
  80. struct xrep_cow *xc,
  81. xfs_fsblock_t startblock,
  82. xfs_filblks_t blockcount)
  83. {
  84. xfs_fileoff_t startoff;
  85. startoff = xc->irec.br_startoff +
  86. (startblock - xc->irec.br_startblock);
  87. trace_xrep_cow_mark_file_range(xc->sc->ip, startblock, startoff,
  88. blockcount);
  89. return xoff_bitmap_set(&xc->bad_fileoffs, startoff, blockcount);
  90. }
  91. /*
  92. * Trim @src to fit within the CoW fork mapping being examined, and put the
  93. * result in @dst.
  94. */
  95. static inline void
  96. xrep_cow_trim_refcount(
  97. struct xrep_cow *xc,
  98. struct xfs_refcount_irec *dst,
  99. const struct xfs_refcount_irec *src)
  100. {
  101. unsigned int adj;
  102. memcpy(dst, src, sizeof(*dst));
  103. if (dst->rc_startblock < xc->irec_startbno) {
  104. adj = xc->irec_startbno - dst->rc_startblock;
  105. dst->rc_blockcount -= adj;
  106. dst->rc_startblock += adj;
  107. }
  108. if (dst->rc_startblock + dst->rc_blockcount >
  109. xc->irec_startbno + xc->irec.br_blockcount) {
  110. adj = (dst->rc_startblock + dst->rc_blockcount) -
  111. (xc->irec_startbno + xc->irec.br_blockcount);
  112. dst->rc_blockcount -= adj;
  113. }
  114. }
  115. /* Mark any shared CoW staging extents. */
  116. STATIC int
  117. xrep_cow_mark_shared_staging(
  118. struct xfs_btree_cur *cur,
  119. const struct xfs_refcount_irec *rec,
  120. void *priv)
  121. {
  122. struct xrep_cow *xc = priv;
  123. struct xfs_refcount_irec rrec;
  124. xfs_fsblock_t fsbno;
  125. if (!xfs_refcount_check_domain(rec) ||
  126. rec->rc_domain != XFS_REFC_DOMAIN_SHARED)
  127. return -EFSCORRUPTED;
  128. xrep_cow_trim_refcount(xc, &rrec, rec);
  129. fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
  130. rrec.rc_startblock);
  131. return xrep_cow_mark_file_range(xc, fsbno, rrec.rc_blockcount);
  132. }
  133. /*
  134. * Mark any portion of the CoW fork file offset range where there is not a CoW
  135. * staging extent record in the refcountbt, and keep a record of where we did
  136. * find correct refcountbt records. Staging records are always cleaned out at
  137. * mount time, so any two inodes trying to map the same staging area would have
  138. * already taken the fs down due to refcount btree verifier errors. Hence this
  139. * inode should be the sole creator of the staging extent records ondisk.
  140. */
  141. STATIC int
  142. xrep_cow_mark_missing_staging(
  143. struct xfs_btree_cur *cur,
  144. const struct xfs_refcount_irec *rec,
  145. void *priv)
  146. {
  147. struct xrep_cow *xc = priv;
  148. struct xfs_refcount_irec rrec;
  149. int error;
  150. if (!xfs_refcount_check_domain(rec) ||
  151. rec->rc_domain != XFS_REFC_DOMAIN_COW)
  152. return -EFSCORRUPTED;
  153. xrep_cow_trim_refcount(xc, &rrec, rec);
  154. if (xc->next_bno >= rrec.rc_startblock)
  155. goto next;
  156. error = xrep_cow_mark_file_range(xc,
  157. XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
  158. xc->next_bno),
  159. rrec.rc_startblock - xc->next_bno);
  160. if (error)
  161. return error;
  162. next:
  163. xc->next_bno = rrec.rc_startblock + rrec.rc_blockcount;
  164. return 0;
  165. }
  166. /*
  167. * Mark any area that does not correspond to a CoW staging rmap. These are
  168. * cross-linked areas that must be avoided.
  169. */
  170. STATIC int
  171. xrep_cow_mark_missing_staging_rmap(
  172. struct xfs_btree_cur *cur,
  173. const struct xfs_rmap_irec *rec,
  174. void *priv)
  175. {
  176. struct xrep_cow *xc = priv;
  177. xfs_fsblock_t fsbno;
  178. xfs_agblock_t rec_bno;
  179. xfs_extlen_t rec_len;
  180. unsigned int adj;
  181. if (rec->rm_owner == XFS_RMAP_OWN_COW)
  182. return 0;
  183. rec_bno = rec->rm_startblock;
  184. rec_len = rec->rm_blockcount;
  185. if (rec_bno < xc->irec_startbno) {
  186. adj = xc->irec_startbno - rec_bno;
  187. rec_len -= adj;
  188. rec_bno += adj;
  189. }
  190. if (rec_bno + rec_len > xc->irec_startbno + xc->irec.br_blockcount) {
  191. adj = (rec_bno + rec_len) -
  192. (xc->irec_startbno + xc->irec.br_blockcount);
  193. rec_len -= adj;
  194. }
  195. fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, rec_bno);
  196. return xrep_cow_mark_file_range(xc, fsbno, rec_len);
  197. }
  198. /*
  199. * Find any part of the CoW fork mapping that isn't a single-owner CoW staging
  200. * extent and mark the corresponding part of the file range in the bitmap.
  201. */
  202. STATIC int
  203. xrep_cow_find_bad(
  204. struct xrep_cow *xc)
  205. {
  206. struct xfs_refcount_irec rc_low = { 0 };
  207. struct xfs_refcount_irec rc_high = { 0 };
  208. struct xfs_rmap_irec rm_low = { 0 };
  209. struct xfs_rmap_irec rm_high = { 0 };
  210. struct xfs_perag *pag;
  211. struct xfs_scrub *sc = xc->sc;
  212. xfs_agnumber_t agno;
  213. int error;
  214. agno = XFS_FSB_TO_AGNO(sc->mp, xc->irec.br_startblock);
  215. xc->irec_startbno = XFS_FSB_TO_AGBNO(sc->mp, xc->irec.br_startblock);
  216. pag = xfs_perag_get(sc->mp, agno);
  217. if (!pag)
  218. return -EFSCORRUPTED;
  219. error = xrep_ag_init(sc, pag, &sc->sa);
  220. if (error)
  221. goto out_pag;
  222. /* Mark any CoW fork extents that are shared. */
  223. rc_low.rc_startblock = xc->irec_startbno;
  224. rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
  225. rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED;
  226. error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high,
  227. xrep_cow_mark_shared_staging, xc);
  228. if (error)
  229. goto out_sa;
  230. /* Make sure there are CoW staging extents for the whole mapping. */
  231. rc_low.rc_startblock = xc->irec_startbno;
  232. rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
  233. rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW;
  234. xc->next_bno = xc->irec_startbno;
  235. error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high,
  236. xrep_cow_mark_missing_staging, xc);
  237. if (error)
  238. goto out_sa;
  239. if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) {
  240. error = xrep_cow_mark_file_range(xc,
  241. XFS_AGB_TO_FSB(sc->mp, pag->pag_agno,
  242. xc->next_bno),
  243. xc->irec_startbno + xc->irec.br_blockcount -
  244. xc->next_bno);
  245. if (error)
  246. goto out_sa;
  247. }
  248. /* Mark any area has an rmap that isn't a COW staging extent. */
  249. rm_low.rm_startblock = xc->irec_startbno;
  250. memset(&rm_high, 0xFF, sizeof(rm_high));
  251. rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
  252. error = xfs_rmap_query_range(sc->sa.rmap_cur, &rm_low, &rm_high,
  253. xrep_cow_mark_missing_staging_rmap, xc);
  254. if (error)
  255. goto out_sa;
  256. /*
  257. * If userspace is forcing us to rebuild the CoW fork or someone turned
  258. * on the debugging knob, replace everything in the CoW fork.
  259. */
  260. if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
  261. XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
  262. error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
  263. xc->irec.br_blockcount);
  264. if (error)
  265. return error;
  266. }
  267. out_sa:
  268. xchk_ag_free(sc, &sc->sa);
  269. out_pag:
  270. xfs_perag_put(pag);
  271. return 0;
  272. }
  273. /*
  274. * Allocate a replacement CoW staging extent of up to the given number of
  275. * blocks, and fill out the mapping.
  276. */
  277. STATIC int
  278. xrep_cow_alloc(
  279. struct xfs_scrub *sc,
  280. xfs_extlen_t maxlen,
  281. struct xrep_cow_extent *repl)
  282. {
  283. struct xfs_alloc_arg args = {
  284. .tp = sc->tp,
  285. .mp = sc->mp,
  286. .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE,
  287. .minlen = 1,
  288. .maxlen = maxlen,
  289. .prod = 1,
  290. .resv = XFS_AG_RESV_NONE,
  291. .datatype = XFS_ALLOC_USERDATA,
  292. };
  293. int error;
  294. error = xfs_trans_reserve_more(sc->tp, maxlen, 0);
  295. if (error)
  296. return error;
  297. error = xfs_alloc_vextent_start_ag(&args,
  298. XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino));
  299. if (error)
  300. return error;
  301. if (args.fsbno == NULLFSBLOCK)
  302. return -ENOSPC;
  303. xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len);
  304. repl->fsbno = args.fsbno;
  305. repl->len = args.len;
  306. return 0;
  307. }
  308. /*
  309. * Look up the current CoW fork mapping so that we only allocate enough to
  310. * replace a single mapping. If we don't find a mapping that covers the start
  311. * of the file range, or we find a delalloc or written extent, something is
  312. * seriously wrong, since we didn't drop the ILOCK.
  313. */
  314. static inline int
  315. xrep_cow_find_mapping(
  316. struct xrep_cow *xc,
  317. struct xfs_iext_cursor *icur,
  318. xfs_fileoff_t startoff,
  319. struct xfs_bmbt_irec *got)
  320. {
  321. struct xfs_inode *ip = xc->sc->ip;
  322. struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
  323. if (!xfs_iext_lookup_extent(ip, ifp, startoff, icur, got))
  324. goto bad;
  325. if (got->br_startoff > startoff)
  326. goto bad;
  327. if (got->br_blockcount == 0)
  328. goto bad;
  329. if (isnullstartblock(got->br_startblock))
  330. goto bad;
  331. if (xfs_bmap_is_written_extent(got))
  332. goto bad;
  333. return 0;
  334. bad:
  335. ASSERT(0);
  336. return -EFSCORRUPTED;
  337. }
/*
 * Bit flags, presumably naming which end of a mapping is to be replaced.
 * NOTE(review): neither flag is referenced anywhere in the visible source;
 * confirm whether they are still needed.
 */
#define REPLACE_LEFT_SIDE	(1U << 0)
#define REPLACE_RIGHT_SIDE	(1U << 1)
  340. /*
  341. * Given a CoW fork mapping @got and a replacement mapping @repl, remap the
  342. * beginning of @got with the space described by @rep.
  343. */
  344. static inline void
  345. xrep_cow_replace_mapping(
  346. struct xfs_inode *ip,
  347. struct xfs_iext_cursor *icur,
  348. const struct xfs_bmbt_irec *got,
  349. const struct xrep_cow_extent *repl)
  350. {
  351. struct xfs_bmbt_irec new = *got; /* struct copy */
  352. ASSERT(repl->len > 0);
  353. ASSERT(!isnullstartblock(got->br_startblock));
  354. trace_xrep_cow_replace_mapping(ip, got, repl->fsbno, repl->len);
  355. if (got->br_blockcount == repl->len) {
  356. /*
  357. * The new extent is a complete replacement for the existing
  358. * extent. Update the COW fork record.
  359. */
  360. new.br_startblock = repl->fsbno;
  361. xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new);
  362. return;
  363. }
  364. /*
  365. * The new extent can replace the beginning of the COW fork record.
  366. * Move the left side of @got upwards, then insert the new record.
  367. */
  368. new.br_startoff += repl->len;
  369. new.br_startblock += repl->len;
  370. new.br_blockcount -= repl->len;
  371. xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new);
  372. new.br_startoff = got->br_startoff;
  373. new.br_startblock = repl->fsbno;
  374. new.br_blockcount = repl->len;
  375. xfs_iext_insert(ip, icur, &new, BMAP_COWFORK);
  376. }
  377. /*
  378. * Replace the unwritten CoW staging extent backing the given file range with a
  379. * new space extent that isn't as problematic.
  380. */
  381. STATIC int
  382. xrep_cow_replace_range(
  383. struct xrep_cow *xc,
  384. xfs_fileoff_t startoff,
  385. xfs_extlen_t *blockcount)
  386. {
  387. struct xfs_iext_cursor icur;
  388. struct xrep_cow_extent repl;
  389. struct xfs_bmbt_irec got;
  390. struct xfs_scrub *sc = xc->sc;
  391. xfs_fileoff_t nextoff;
  392. xfs_extlen_t alloc_len;
  393. int error;
  394. /*
  395. * Put the existing CoW fork mapping in @got. If @got ends before
  396. * @rep, truncate @rep so we only replace one extent mapping at a time.
  397. */
  398. error = xrep_cow_find_mapping(xc, &icur, startoff, &got);
  399. if (error)
  400. return error;
  401. nextoff = min(startoff + *blockcount,
  402. got.br_startoff + got.br_blockcount);
  403. /*
  404. * Allocate a replacement extent. If we don't fill all the blocks,
  405. * shorten the quantity that will be deleted in this step.
  406. */
  407. alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN,
  408. nextoff - startoff);
  409. error = xrep_cow_alloc(sc, alloc_len, &repl);
  410. if (error)
  411. return error;
  412. /*
  413. * Replace the old mapping with the new one, and commit the metadata
  414. * changes made so far.
  415. */
  416. xrep_cow_replace_mapping(sc->ip, &icur, &got, &repl);
  417. xfs_inode_set_cowblocks_tag(sc->ip);
  418. error = xfs_defer_finish(&sc->tp);
  419. if (error)
  420. return error;
  421. /* Note the old CoW staging extents; we'll reap them all later. */
  422. error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock,
  423. repl.len);
  424. if (error)
  425. return error;
  426. *blockcount = repl.len;
  427. return 0;
  428. }
  429. /*
  430. * Replace a bad part of an unwritten CoW staging extent with a fresh delalloc
  431. * reservation.
  432. */
  433. STATIC int
  434. xrep_cow_replace(
  435. uint64_t startoff,
  436. uint64_t blockcount,
  437. void *priv)
  438. {
  439. struct xrep_cow *xc = priv;
  440. int error = 0;
  441. while (blockcount > 0) {
  442. xfs_extlen_t len = min_t(xfs_filblks_t, blockcount,
  443. XFS_MAX_BMBT_EXTLEN);
  444. error = xrep_cow_replace_range(xc, startoff, &len);
  445. if (error)
  446. break;
  447. blockcount -= len;
  448. startoff += len;
  449. }
  450. return error;
  451. }
  452. /*
  453. * Repair an inode's CoW fork. The CoW fork is an in-core structure, so
  454. * there's no btree to rebuid. Instead, we replace any mappings that are
  455. * cross-linked or lack ondisk CoW fork records in the refcount btree.
  456. */
  457. int
  458. xrep_bmap_cow(
  459. struct xfs_scrub *sc)
  460. {
  461. struct xrep_cow *xc;
  462. struct xfs_iext_cursor icur;
  463. struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_COW_FORK);
  464. int error;
  465. if (!xfs_has_rmapbt(sc->mp) || !xfs_has_reflink(sc->mp))
  466. return -EOPNOTSUPP;
  467. if (!ifp)
  468. return 0;
  469. /* realtime files aren't supported yet */
  470. if (XFS_IS_REALTIME_INODE(sc->ip))
  471. return -EOPNOTSUPP;
  472. /*
  473. * If we're somehow not in extents format, then reinitialize it to
  474. * an empty extent mapping fork and exit.
  475. */
  476. if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
  477. ifp->if_format = XFS_DINODE_FMT_EXTENTS;
  478. ifp->if_nextents = 0;
  479. return 0;
  480. }
  481. xc = kzalloc(sizeof(struct xrep_cow), XCHK_GFP_FLAGS);
  482. if (!xc)
  483. return -ENOMEM;
  484. xfs_trans_ijoin(sc->tp, sc->ip, 0);
  485. xc->sc = sc;
  486. xoff_bitmap_init(&xc->bad_fileoffs);
  487. xfsb_bitmap_init(&xc->old_cowfork_fsblocks);
  488. for_each_xfs_iext(ifp, &icur, &xc->irec) {
  489. if (xchk_should_terminate(sc, &error))
  490. goto out_bitmap;
  491. /*
  492. * delalloc reservations only exist incore, so there is no
  493. * ondisk metadata that we can examine. Hence we leave them
  494. * alone.
  495. */
  496. if (isnullstartblock(xc->irec.br_startblock))
  497. continue;
  498. /*
  499. * COW fork extents are only in the written state if writeback
  500. * is actively writing to disk. We cannot restart the write
  501. * at a different disk address since we've already issued the
  502. * IO, so we leave these alone and hope for the best.
  503. */
  504. if (xfs_bmap_is_written_extent(&xc->irec))
  505. continue;
  506. error = xrep_cow_find_bad(xc);
  507. if (error)
  508. goto out_bitmap;
  509. }
  510. /* Replace any bad unwritten mappings with fresh reservations. */
  511. error = xoff_bitmap_walk(&xc->bad_fileoffs, xrep_cow_replace, xc);
  512. if (error)
  513. goto out_bitmap;
  514. /*
  515. * Reap as many of the old CoW blocks as we can. They are owned ondisk
  516. * by the refcount btree, not the inode, so it is correct to treat them
  517. * like inode metadata.
  518. */
  519. error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks,
  520. &XFS_RMAP_OINFO_COW);
  521. if (error)
  522. goto out_bitmap;
  523. out_bitmap:
  524. xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks);
  525. xoff_bitmap_destroy(&xc->bad_fileoffs);
  526. kfree(xc);
  527. return error;
  528. }