  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. * Copyright (C) 2019-2023 Oracle. All Rights Reserved.
  4. * Author: Darrick J. Wong <djwong@kernel.org>
  5. */
  6. #include "xfs.h"
  7. #include "xfs_fs.h"
  8. #include "xfs_shared.h"
  9. #include "xfs_format.h"
  10. #include "xfs_trans_resv.h"
  11. #include "xfs_log_format.h"
  12. #include "xfs_trans.h"
  13. #include "xfs_mount.h"
  14. #include "xfs_alloc.h"
  15. #include "xfs_ialloc.h"
  16. #include "xfs_health.h"
  17. #include "xfs_btree.h"
  18. #include "xfs_ag.h"
  19. #include "xfs_rtbitmap.h"
  20. #include "xfs_inode.h"
  21. #include "xfs_icache.h"
  22. #include "scrub/scrub.h"
  23. #include "scrub/common.h"
  24. #include "scrub/trace.h"
  25. #include "scrub/fscounters.h"
/*
 * FS Summary Counters
 * ===================
 *
 * The basics of filesystem summary counter checking are that we iterate the
 * AGs counting the number of free blocks, free space btree blocks, per-AG
 * reservations, inodes, delayed allocation reservations, and free inodes.
 * Then we compare what we computed against the in-core counters.
 *
 * However, the reality is that summary counters are a tricky beast to check.
 * While we /could/ freeze the filesystem and scramble around the AGs counting
 * the free blocks, in practice we prefer not to do that for a scan because
 * freezing is costly.  To get around this, we added a per-cpu counter of the
 * delalloc reservations so that we can rotor around the AGs relatively
 * quickly, and we allow the counts to be slightly off because we're not
 * taking any locks while we do this.
 *
 * So the first thing we do is warm up the buffer cache in the setup routine
 * by walking all the AGs to make sure the incore per-AG structure has been
 * initialized.  The expected value calculation then iterates the incore
 * per-AG structures as quickly as it can.  We snapshot the percpu counters
 * before and after this operation and use the difference in counter values
 * to guess at our tolerance for mismatch between expected and actual counter
 * values.
 */
/*
 * Since the expected value computation is lockless but only browses incore
 * values, the percpu counters should be fairly close to each other.  However,
 * we'll allow ourselves to be off by at least this (arbitrary) amount.
 */
#define XCHK_FSCOUNT_MIN_VARIANCE	(512)
/*
 * Make sure the per-AG structure has been initialized from the on-disk header
 * contents and trust that the incore counters match the ondisk counters.  (The
 * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the
 * summary counters after checking all AG headers.)  Do this from the setup
 * function so that the inner AG aggregation loop runs as quickly as possible.
 *
 * This function runs during the setup phase /before/ we start checking any
 * metadata.
 *
 * Returns 0 on success or a negative errno.
 */
STATIC int
xchk_fscount_warmup(
	struct xfs_scrub	*sc)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_buf		*agi_bp = NULL;
	struct xfs_buf		*agf_bp = NULL;
	struct xfs_perag	*pag = NULL;
	xfs_agnumber_t		agno;
	int			error = 0;

	for_each_perag(mp, agno, pag) {
		if (xchk_should_terminate(sc, &error))
			break;
		/* Already initialized?  Nothing to read for this AG. */
		if (xfs_perag_initialised_agi(pag) &&
		    xfs_perag_initialised_agf(pag))
			continue;

		/* Lock both AG headers. */
		error = xfs_ialloc_read_agi(pag, sc->tp, 0, &agi_bp);
		if (error)
			break;
		error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp);
		if (error)
			break;

		/*
		 * These are supposed to be initialized by the header read
		 * function.
		 */
		if (!xfs_perag_initialised_agi(pag) ||
		    !xfs_perag_initialised_agf(pag)) {
			error = -EFSCORRUPTED;
			break;
		}

		xfs_buf_relse(agf_bp);
		agf_bp = NULL;
		xfs_buf_relse(agi_bp);
		agi_bp = NULL;
	}

	/*
	 * On early exit from the loop, release any header buffers still held
	 * and drop the active perag reference left by for_each_perag.
	 */
	if (agf_bp)
		xfs_buf_relse(agf_bp);
	if (agi_bp)
		xfs_buf_relse(agi_bp);
	if (pag)
		xfs_perag_rele(pag);
	return error;
}
  111. static inline int
  112. xchk_fsfreeze(
  113. struct xfs_scrub *sc)
  114. {
  115. int error;
  116. error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
  117. trace_xchk_fsfreeze(sc, error);
  118. return error;
  119. }
  120. static inline int
  121. xchk_fsthaw(
  122. struct xfs_scrub *sc)
  123. {
  124. int error;
  125. /* This should always succeed, we have a kernel freeze */
  126. error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
  127. trace_xchk_fsthaw(sc, error);
  128. return error;
  129. }
  130. /*
  131. * We couldn't stabilize the filesystem long enough to sample all the variables
  132. * that comprise the summary counters and compare them to the percpu counters.
  133. * We need to disable all writer threads, which means taking the first two
  134. * freeze levels to put userspace to sleep, and the third freeze level to
  135. * prevent background threads from starting new transactions. Take one level
  136. * more to prevent other callers from unfreezing the filesystem while we run.
  137. */
  138. STATIC int
  139. xchk_fscounters_freeze(
  140. struct xfs_scrub *sc)
  141. {
  142. struct xchk_fscounters *fsc = sc->buf;
  143. int error = 0;
  144. if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
  145. sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
  146. mnt_drop_write_file(sc->file);
  147. }
  148. /* Try to grab a kernel freeze. */
  149. while ((error = xchk_fsfreeze(sc)) == -EBUSY) {
  150. if (xchk_should_terminate(sc, &error))
  151. return error;
  152. delay(HZ / 10);
  153. }
  154. if (error)
  155. return error;
  156. fsc->frozen = true;
  157. return 0;
  158. }
  159. /* Thaw the filesystem after checking or repairing fscounters. */
  160. STATIC void
  161. xchk_fscounters_cleanup(
  162. void *buf)
  163. {
  164. struct xchk_fscounters *fsc = buf;
  165. struct xfs_scrub *sc = fsc->sc;
  166. int error;
  167. if (!fsc->frozen)
  168. return;
  169. error = xchk_fsthaw(sc);
  170. if (error)
  171. xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error);
  172. else
  173. fsc->frozen = false;
  174. }
/* Set up scrub to check the filesystem summary counters. */
int
xchk_setup_fscounters(
	struct xfs_scrub	*sc)
{
	struct xchk_fscounters	*fsc;
	int			error;

	/*
	 * If the AGF doesn't track btreeblks, we have to lock the AGF to count
	 * btree block usage by walking the actual btrees.
	 */
	if (!xfs_has_lazysbcount(sc->mp))
		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);

	sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS);
	if (!sc->buf)
		return -ENOMEM;
	/* The cleanup callback thaws the fs if we freeze it below. */
	sc->buf_cleanup = xchk_fscounters_cleanup;
	fsc = sc->buf;
	fsc->sc = sc;

	xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max);

	/* We must get the incore counters set up before we can proceed. */
	error = xchk_fscount_warmup(sc);
	if (error)
		return error;

	/*
	 * Pause all writer activity in the filesystem while we're scrubbing to
	 * reduce the likelihood of background perturbations to the counters
	 * throwing off our calculations.
	 *
	 * If we're repairing, we need to prevent any other thread from
	 * changing the global fs summary counters while we're repairing them.
	 * This requires the fs to be frozen, which will disable background
	 * reclaim and purge all inactive inodes.
	 */
	if ((sc->flags & XCHK_TRY_HARDER) || xchk_could_repair(sc)) {
		error = xchk_fscounters_freeze(sc);
		if (error)
			return error;
	}

	return xchk_trans_alloc_empty(sc);
}
/*
 * Part 1: Collecting filesystem summary counts.  For each AG, we add its
 * summary counts (total inodes, free inodes, free data blocks) to an incore
 * copy of the overall filesystem summary counts.
 *
 * To avoid false corruption reports in part 2, any failure in this part must
 * set the INCOMPLETE flag even when a negative errno is returned.  This care
 * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
 * ECANCELED) that are absorbed into a scrub state flag update by
 * xchk_*_process_error.  Scrub and repair share the same incore data
 * structures, so the INCOMPLETE flag is critical to prevent a repair based on
 * insufficient information.
 */
  228. /* Count free space btree blocks manually for pre-lazysbcount filesystems. */
  229. static int
  230. xchk_fscount_btreeblks(
  231. struct xfs_scrub *sc,
  232. struct xchk_fscounters *fsc,
  233. xfs_agnumber_t agno)
  234. {
  235. xfs_filblks_t blocks;
  236. int error;
  237. error = xchk_ag_init_existing(sc, agno, &sc->sa);
  238. if (error)
  239. goto out_free;
  240. error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks);
  241. if (error)
  242. goto out_free;
  243. fsc->fdblocks += blocks - 1;
  244. error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks);
  245. if (error)
  246. goto out_free;
  247. fsc->fdblocks += blocks - 1;
  248. out_free:
  249. xchk_ag_free(sc, &sc->sa);
  250. return error;
  251. }
/*
 * Calculate what the global in-core counters ought to be from the incore
 * per-AG structure.  Callers can compare this to the actual in-core counters
 * to estimate by how much both in-core and on-disk counters need to be
 * adjusted.
 */
STATIC int
xchk_fscount_aggregate_agcounts(
	struct xfs_scrub	*sc,
	struct xchk_fscounters	*fsc)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_perag	*pag;
	uint64_t		delayed;
	xfs_agnumber_t		agno;
	int			tries = 8;
	int			error = 0;

retry:
	/* Start each (re)try from a clean slate. */
	fsc->icount = 0;
	fsc->ifree = 0;
	fsc->fdblocks = 0;

	for_each_perag(mp, agno, pag) {
		if (xchk_should_terminate(sc, &error))
			break;

		/* This somehow got unset since the warmup? */
		if (!xfs_perag_initialised_agi(pag) ||
		    !xfs_perag_initialised_agf(pag)) {
			error = -EFSCORRUPTED;
			break;
		}

		/* Count all the inodes */
		fsc->icount += pag->pagi_count;
		fsc->ifree += pag->pagi_freecount;

		/* Add up the free/freelist/bnobt/cntbt blocks */
		fsc->fdblocks += pag->pagf_freeblks;
		fsc->fdblocks += pag->pagf_flcount;
		if (xfs_has_lazysbcount(sc->mp)) {
			fsc->fdblocks += pag->pagf_btreeblks;
		} else {
			/* Pre-lazysbcount: walk the btrees to count blocks. */
			error = xchk_fscount_btreeblks(sc, fsc, agno);
			if (error)
				break;
		}

		/*
		 * Per-AG reservations are taken out of the incore counters,
		 * so they must be left out of the free blocks computation.
		 */
		fsc->fdblocks -= pag->pag_meta_resv.ar_reserved;
		fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved;
	}
	/* Drop the active perag reference if the loop exited early. */
	if (pag)
		xfs_perag_rele(pag);
	if (error) {
		/* Mark INCOMPLETE so part 2 won't compare garbage. */
		xchk_set_incomplete(sc);
		return error;
	}

	/*
	 * The global incore space reservation is taken from the incore
	 * counters, so leave that out of the computation.
	 */
	fsc->fdblocks -= mp->m_resblks_avail;

	/*
	 * Delayed allocation reservations are taken out of the incore counters
	 * but not recorded on disk, so leave them and their indlen blocks out
	 * of the computation.
	 */
	delayed = percpu_counter_sum(&mp->m_delalloc_blks);
	fsc->fdblocks -= delayed;

	trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks,
			delayed);

	/* Bail out if the values we compute are totally nonsense. */
	if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max ||
	    fsc->fdblocks > mp->m_sb.sb_dblocks ||
	    fsc->ifree > fsc->icount_max)
		return -EFSCORRUPTED;

	/*
	 * If ifree > icount then we probably had some perturbation in the
	 * counters while we were calculating things.  We'll try a few times
	 * to maintain ifree <= icount before giving up.
	 */
	if (fsc->ifree > fsc->icount) {
		if (tries--)
			goto retry;
		return -EDEADLOCK;
	}

	return 0;
}
#ifdef CONFIG_XFS_RT
/* Add this record's free rt extents to the running total. */
STATIC int
xchk_fscount_add_frextent(
	struct xfs_mount		*mp,
	struct xfs_trans		*tp,
	const struct xfs_rtalloc_rec	*rec,
	void				*priv)
{
	struct xchk_fscounters		*fsc = priv;
	int				error = 0;

	fsc->frextents += rec->ar_extcount;

	/* Abort the bitmap walk if the user cancelled the scrub. */
	xchk_should_terminate(fsc->sc, &error);
	return error;
}

/* Calculate the number of free realtime extents from the realtime bitmap. */
STATIC int
xchk_fscount_count_frextents(
	struct xfs_scrub	*sc,
	struct xchk_fscounters	*fsc)
{
	struct xfs_mount	*mp = sc->mp;
	int			error;

	fsc->frextents = 0;
	fsc->frextents_delayed = 0;
	/* No realtime device means there's nothing to count. */
	if (!xfs_has_realtime(mp))
		return 0;

	/* Walk the whole rtbitmap while holding it shared. */
	xfs_rtbitmap_lock_shared(sc->mp, XFS_RBMLOCK_BITMAP);
	error = xfs_rtalloc_query_all(sc->mp, sc->tp,
			xchk_fscount_add_frextent, fsc);
	if (error) {
		/* Incomplete walk; don't let part 2 compare garbage. */
		xchk_set_incomplete(sc);
		goto out_unlock;
	}

	/*
	 * Sample the delalloc rtextent reservations; part 2 subtracts these
	 * from the bitmap total before comparing against m_frextents.
	 */
	fsc->frextents_delayed = percpu_counter_sum(&mp->m_delalloc_rtextents);

out_unlock:
	xfs_rtbitmap_unlock_shared(sc->mp, XFS_RBMLOCK_BITMAP);
	return error;
}
#else
/* Without CONFIG_XFS_RT there are never any free rt extents. */
STATIC int
xchk_fscount_count_frextents(
	struct xfs_scrub	*sc,
	struct xchk_fscounters	*fsc)
{
	fsc->frextents = 0;
	fsc->frextents_delayed = 0;
	return 0;
}
#endif /* CONFIG_XFS_RT */
/*
 * Part 2: Comparing filesystem summary counters.  All we have to do here is
 * sum the percpu counters and compare them to what we've observed.
 */
  392. /*
  393. * Is the @counter reasonably close to the @expected value?
  394. *
  395. * We neither locked nor froze anything in the filesystem while aggregating the
  396. * per-AG data to compute the @expected value, which means that the counter
  397. * could have changed. We know the @old_value of the summation of the counter
  398. * before the aggregation, and we re-sum the counter now. If the expected
  399. * value falls between the two summations, we're ok.
  400. *
  401. * Otherwise, we /might/ have a problem. If the change in the summations is
  402. * more than we want to tolerate, the filesystem is probably busy and we should
  403. * just send back INCOMPLETE and see if userspace will try again.
  404. *
  405. * If we're repairing then we require an exact match.
  406. */
  407. static inline bool
  408. xchk_fscount_within_range(
  409. struct xfs_scrub *sc,
  410. const int64_t old_value,
  411. struct percpu_counter *counter,
  412. uint64_t expected)
  413. {
  414. int64_t min_value, max_value;
  415. int64_t curr_value = percpu_counter_sum(counter);
  416. trace_xchk_fscounters_within_range(sc->mp, expected, curr_value,
  417. old_value);
  418. /* Negative values are always wrong. */
  419. if (curr_value < 0)
  420. return false;
  421. /* Exact matches are always ok. */
  422. if (curr_value == expected)
  423. return true;
  424. /* We require exact matches when repair is running. */
  425. if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
  426. return false;
  427. min_value = min(old_value, curr_value);
  428. max_value = max(old_value, curr_value);
  429. /* Within the before-and-after range is ok. */
  430. if (expected >= min_value && expected <= max_value)
  431. return true;
  432. /* Everything else is bad. */
  433. return false;
  434. }
/* Check the superblock counters. */
int
xchk_fscounters(
	struct xfs_scrub	*sc)
{
	struct xfs_mount	*mp = sc->mp;
	struct xchk_fscounters	*fsc = sc->buf;
	int64_t			icount, ifree, fdblocks, frextents;
	bool			try_again = false;
	int			error;

	/* Snapshot the percpu counters. */
	icount = percpu_counter_sum(&mp->m_icount);
	ifree = percpu_counter_sum(&mp->m_ifree);
	fdblocks = percpu_counter_sum(&mp->m_fdblocks);
	frextents = percpu_counter_sum(&mp->m_frextents);

	/* No negative values, please! */
	if (icount < 0 || ifree < 0)
		xchk_set_corrupt(sc);

	/*
	 * If the filesystem is not frozen, the counter summation calls above
	 * can race with xfs_dec_freecounter, which subtracts a requested space
	 * reservation from the counter and undoes the subtraction if that made
	 * the counter go negative.  Therefore, it's possible to see negative
	 * values here, and we should only flag that as a corruption if we
	 * froze the fs.  This is much more likely to happen with frextents
	 * since there are no reserved pools.
	 */
	if (fdblocks < 0 || frextents < 0) {
		if (!fsc->frozen)
			return -EDEADLOCK;

		xchk_set_corrupt(sc);
		return 0;
	}

	/* See if icount is obviously wrong. */
	if (icount < fsc->icount_min || icount > fsc->icount_max)
		xchk_set_corrupt(sc);

	/* See if fdblocks is obviously wrong. */
	if (fdblocks > mp->m_sb.sb_dblocks)
		xchk_set_corrupt(sc);

	/* See if frextents is obviously wrong. */
	if (frextents > mp->m_sb.sb_rextents)
		xchk_set_corrupt(sc);

	/*
	 * If ifree exceeds icount by more than the minimum variance then
	 * something's probably wrong with the counters.
	 */
	if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE)
		xchk_set_corrupt(sc);

	/* Walk the incore AG headers to calculate the expected counters. */
	error = xchk_fscount_aggregate_agcounts(sc, fsc);
	if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
		return error;

	/* Count the free extents counter for rt volumes. */
	error = xchk_fscount_count_frextents(sc, fsc);
	if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
		return error;
	/* Aggregation bailed out; don't compare incomplete data. */
	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
		return 0;

	/*
	 * Compare the in-core counters with whatever we counted.  If the fs
	 * is frozen, we treat the discrepancy as a corruption because the
	 * freeze should have stabilized the counter values.  Otherwise, we
	 * need userspace to call us back having granted us freeze permission.
	 */
	if (!xchk_fscount_within_range(sc, icount, &mp->m_icount,
				fsc->icount)) {
		if (fsc->frozen)
			xchk_set_corrupt(sc);
		else
			try_again = true;
	}

	if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) {
		if (fsc->frozen)
			xchk_set_corrupt(sc);
		else
			try_again = true;
	}

	if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
			fsc->fdblocks)) {
		if (fsc->frozen)
			xchk_set_corrupt(sc);
		else
			try_again = true;
	}

	/* Delalloc rtextents live in m_frextents but not in the rtbitmap. */
	if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
			fsc->frextents - fsc->frextents_delayed)) {
		if (fsc->frozen)
			xchk_set_corrupt(sc);
		else
			try_again = true;
	}

	/* Ask userspace to try again with freeze permission granted. */
	if (try_again)
		return -EDEADLOCK;

	return 0;
}
  529. }