| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611 |
- // SPDX-License-Identifier: GPL-2.0-or-later
- /*
- * Copyright (C) 2019-2023 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <djwong@kernel.org>
- */
- #include "xfs.h"
- #include "xfs_fs.h"
- #include "xfs_shared.h"
- #include "xfs_format.h"
- #include "xfs_trans_resv.h"
- #include "xfs_log_format.h"
- #include "xfs_trans.h"
- #include "xfs_mount.h"
- #include "xfs_alloc.h"
- #include "xfs_ialloc.h"
- #include "xfs_health.h"
- #include "xfs_btree.h"
- #include "xfs_ag.h"
- #include "xfs_rtbitmap.h"
- #include "xfs_inode.h"
- #include "xfs_icache.h"
- #include "scrub/scrub.h"
- #include "scrub/common.h"
- #include "scrub/trace.h"
- #include "scrub/fscounters.h"
- /*
- * FS Summary Counters
- * ===================
- *
- * The basics of filesystem summary counter checking are that we iterate the
- * AGs counting the number of free blocks, free space btree blocks, per-AG
- * reservations, inodes, delayed allocation reservations, and free inodes.
- * Then we compare what we computed against the in-core counters.
- *
- * However, the reality is that summary counters are a tricky beast to check.
- * While we /could/ freeze the filesystem and scramble around the AGs counting
- * the free blocks, in practice we prefer not to do that for a scan because
- * freezing is costly. To get around this, we added a per-cpu counter of the
- * delalloc reservations so that we can rotor around the AGs relatively
- * quickly, and we allow the counts to be slightly off because we're not taking
- * any locks while we do this.
- *
- * So the first thing we do is warm up the buffer cache in the setup routine by
- * walking all the AGs to make sure the incore per-AG structure has been
- * initialized. The expected value calculation then iterates the incore per-AG
- * structures as quickly as it can. We snapshot the percpu counters before and
- * after this operation and use the difference in counter values to guess at
- * our tolerance for mismatch between expected and actual counter values.
- */
- /*
- * Since the expected value computation is lockless but only browses incore
- * values, the percpu counters should be fairly close to each other. However,
- * we'll allow ourselves to be off by at least this (arbitrary) amount.
- *
- * xchk_fscounters() uses this as the slack allowed when the summed free
- * inode counter exceeds the summed inode counter: an excess larger than this
- * value is flagged as corruption.
- */
- #define XCHK_FSCOUNT_MIN_VARIANCE (512)
- /*
- * Make sure the per-AG structure has been initialized from the on-disk header
- * contents and trust that the incore counters match the ondisk counters. (The
- * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the
- * summary counters after checking all AG headers). Do this from the setup
- * function so that the inner AG aggregation loop runs as quickly as possible.
- *
- * This function runs during the setup phase /before/ we start checking any
- * metadata.
- */
- STATIC int
- xchk_fscount_warmup(
- struct xfs_scrub *sc)
- {
- struct xfs_mount *mp = sc->mp;
- struct xfs_buf *agi_bp = NULL;
- struct xfs_buf *agf_bp = NULL;
- struct xfs_perag *pag = NULL;
- xfs_agnumber_t agno;
- int error = 0;
- for_each_perag(mp, agno, pag) {
- if (xchk_should_terminate(sc, &error))
- break;
- if (xfs_perag_initialised_agi(pag) &&
- xfs_perag_initialised_agf(pag))
- continue;
- /* Lock both AG headers. */
- error = xfs_ialloc_read_agi(pag, sc->tp, 0, &agi_bp);
- if (error)
- break;
- error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp);
- if (error)
- break;
- /*
- * These are supposed to be initialized by the header read
- * function.
- */
- if (!xfs_perag_initialised_agi(pag) ||
- !xfs_perag_initialised_agf(pag)) {
- error = -EFSCORRUPTED;
- break;
- }
- xfs_buf_relse(agf_bp);
- agf_bp = NULL;
- xfs_buf_relse(agi_bp);
- agi_bp = NULL;
- }
- if (agf_bp)
- xfs_buf_relse(agf_bp);
- if (agi_bp)
- xfs_buf_relse(agi_bp);
- if (pag)
- xfs_perag_rele(pag);
- return error;
- }
- static inline int
- xchk_fsfreeze(
- struct xfs_scrub *sc)
- {
- int error;
- error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
- trace_xchk_fsfreeze(sc, error);
- return error;
- }
- static inline int
- xchk_fsthaw(
- struct xfs_scrub *sc)
- {
- int error;
- /* This should always succeed, we have a kernel freeze */
- error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
- trace_xchk_fsthaw(sc, error);
- return error;
- }
- /*
- * We couldn't stabilize the filesystem long enough to sample all the variables
- * that comprise the summary counters and compare them to the percpu counters.
- * We need to disable all writer threads, which means taking the first two
- * freeze levels to put userspace to sleep, and the third freeze level to
- * prevent background threads from starting new transactions. Take one level
- * more to prevent other callers from unfreezing the filesystem while we run.
- */
- STATIC int
- xchk_fscounters_freeze(
- struct xfs_scrub *sc)
- {
- struct xchk_fscounters *fsc = sc->buf;
- int error = 0;
- if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
- sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
- mnt_drop_write_file(sc->file);
- }
- /* Try to grab a kernel freeze. */
- while ((error = xchk_fsfreeze(sc)) == -EBUSY) {
- if (xchk_should_terminate(sc, &error))
- return error;
- delay(HZ / 10);
- }
- if (error)
- return error;
- fsc->frozen = true;
- return 0;
- }
- /* Thaw the filesystem after checking or repairing fscounters. */
- STATIC void
- xchk_fscounters_cleanup(
- void *buf)
- {
- struct xchk_fscounters *fsc = buf;
- struct xfs_scrub *sc = fsc->sc;
- int error;
- if (!fsc->frozen)
- return;
- error = xchk_fsthaw(sc);
- if (error)
- xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error);
- else
- fsc->frozen = false;
- }
- int
- xchk_setup_fscounters(
- struct xfs_scrub *sc)
- {
- struct xchk_fscounters *fsc;
- int error;
- /*
- * If the AGF doesn't track btreeblks, we have to lock the AGF to count
- * btree block usage by walking the actual btrees.
- */
- if (!xfs_has_lazysbcount(sc->mp))
- xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
- sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS);
- if (!sc->buf)
- return -ENOMEM;
- sc->buf_cleanup = xchk_fscounters_cleanup;
- fsc = sc->buf;
- fsc->sc = sc;
- xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max);
- /* We must get the incore counters set up before we can proceed. */
- error = xchk_fscount_warmup(sc);
- if (error)
- return error;
- /*
- * Pause all writer activity in the filesystem while we're scrubbing to
- * reduce the likelihood of background perturbations to the counters
- * throwing off our calculations.
- *
- * If we're repairing, we need to prevent any other thread from
- * changing the global fs summary counters while we're repairing them.
- * This requires the fs to be frozen, which will disable background
- * reclaim and purge all inactive inodes.
- */
- if ((sc->flags & XCHK_TRY_HARDER) || xchk_could_repair(sc)) {
- error = xchk_fscounters_freeze(sc);
- if (error)
- return error;
- }
- return xchk_trans_alloc_empty(sc);
- }
- /*
- * Part 1: Collecting filesystem summary counts. For each AG, we add its
- * summary counts (total inodes, free inodes, free data blocks) to an incore
- * copy of the overall filesystem summary counts.
- *
- * To avoid false corruption reports in part 2, any failure in this part must
- * set the INCOMPLETE flag even when a negative errno is returned. This care
- * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
- * ECANCELED) that are absorbed into a scrub state flag update by
- * xchk_*_process_error. Scrub and repair share the same incore data
- * structures, so the INCOMPLETE flag is critical to prevent a repair based on
- * insufficient information.
- */
- /* Count free space btree blocks manually for pre-lazysbcount filesystems. */
- static int
- xchk_fscount_btreeblks(
- struct xfs_scrub *sc,
- struct xchk_fscounters *fsc,
- xfs_agnumber_t agno)
- {
- xfs_filblks_t blocks;
- int error;
- error = xchk_ag_init_existing(sc, agno, &sc->sa);
- if (error)
- goto out_free;
- error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks);
- if (error)
- goto out_free;
- fsc->fdblocks += blocks - 1;
- error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks);
- if (error)
- goto out_free;
- fsc->fdblocks += blocks - 1;
- out_free:
- xchk_ag_free(sc, &sc->sa);
- return error;
- }
- /*
- * Calculate what the global in-core counters ought to be from the incore
- * per-AG structure. Callers can compare this to the actual in-core counters
- * to estimate by how much both in-core and on-disk counters need to be
- * adjusted.
- */
- STATIC int
- xchk_fscount_aggregate_agcounts(
- struct xfs_scrub *sc,
- struct xchk_fscounters *fsc)
- {
- struct xfs_mount *mp = sc->mp;
- struct xfs_perag *pag;
- uint64_t delayed;
- xfs_agnumber_t agno;
- int tries = 8;
- int error = 0;
- retry:
- fsc->icount = 0;
- fsc->ifree = 0;
- fsc->fdblocks = 0;
- for_each_perag(mp, agno, pag) {
- if (xchk_should_terminate(sc, &error))
- break;
- /* This somehow got unset since the warmup? */
- if (!xfs_perag_initialised_agi(pag) ||
- !xfs_perag_initialised_agf(pag)) {
- error = -EFSCORRUPTED;
- break;
- }
- /* Count all the inodes */
- fsc->icount += pag->pagi_count;
- fsc->ifree += pag->pagi_freecount;
- /* Add up the free/freelist/bnobt/cntbt blocks */
- fsc->fdblocks += pag->pagf_freeblks;
- fsc->fdblocks += pag->pagf_flcount;
- if (xfs_has_lazysbcount(sc->mp)) {
- fsc->fdblocks += pag->pagf_btreeblks;
- } else {
- error = xchk_fscount_btreeblks(sc, fsc, agno);
- if (error)
- break;
- }
- /*
- * Per-AG reservations are taken out of the incore counters,
- * so they must be left out of the free blocks computation.
- */
- fsc->fdblocks -= pag->pag_meta_resv.ar_reserved;
- fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved;
- }
- if (pag)
- xfs_perag_rele(pag);
- if (error) {
- xchk_set_incomplete(sc);
- return error;
- }
- /*
- * The global incore space reservation is taken from the incore
- * counters, so leave that out of the computation.
- */
- fsc->fdblocks -= mp->m_resblks_avail;
- /*
- * Delayed allocation reservations are taken out of the incore counters
- * but not recorded on disk, so leave them and their indlen blocks out
- * of the computation.
- */
- delayed = percpu_counter_sum(&mp->m_delalloc_blks);
- fsc->fdblocks -= delayed;
- trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks,
- delayed);
- /* Bail out if the values we compute are totally nonsense. */
- if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max ||
- fsc->fdblocks > mp->m_sb.sb_dblocks ||
- fsc->ifree > fsc->icount_max)
- return -EFSCORRUPTED;
- /*
- * If ifree > icount then we probably had some perturbation in the
- * counters while we were calculating things. We'll try a few times
- * to maintain ifree <= icount before giving up.
- */
- if (fsc->ifree > fsc->icount) {
- if (tries--)
- goto retry;
- return -EDEADLOCK;
- }
- return 0;
- }
- #ifdef CONFIG_XFS_RT
- STATIC int
- xchk_fscount_add_frextent(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- const struct xfs_rtalloc_rec *rec,
- void *priv)
- {
- struct xchk_fscounters *fsc = priv;
- int error = 0;
- fsc->frextents += rec->ar_extcount;
- xchk_should_terminate(fsc->sc, &error);
- return error;
- }
- /* Calculate the number of free realtime extents from the realtime bitmap. */
- STATIC int
- xchk_fscount_count_frextents(
- struct xfs_scrub *sc,
- struct xchk_fscounters *fsc)
- {
- struct xfs_mount *mp = sc->mp;
- int error;
- fsc->frextents = 0;
- fsc->frextents_delayed = 0;
- if (!xfs_has_realtime(mp))
- return 0;
- xfs_rtbitmap_lock_shared(sc->mp, XFS_RBMLOCK_BITMAP);
- error = xfs_rtalloc_query_all(sc->mp, sc->tp,
- xchk_fscount_add_frextent, fsc);
- if (error) {
- xchk_set_incomplete(sc);
- goto out_unlock;
- }
- fsc->frextents_delayed = percpu_counter_sum(&mp->m_delalloc_rtextents);
- out_unlock:
- xfs_rtbitmap_unlock_shared(sc->mp, XFS_RBMLOCK_BITMAP);
- return error;
- }
- #else
- STATIC int
- xchk_fscount_count_frextents(
- struct xfs_scrub *sc,
- struct xchk_fscounters *fsc)
- {
- fsc->frextents = 0;
- fsc->frextents_delayed = 0;
- return 0;
- }
- #endif /* CONFIG_XFS_RT */
- /*
- * Part 2: Comparing filesystem summary counters. All we have to do here is
- * sum the percpu counters and compare them to what we've observed.
- */
- /*
- * Is the @counter reasonably close to the @expected value?
- *
- * We neither locked nor froze anything in the filesystem while aggregating the
- * per-AG data to compute the @expected value, which means that the counter
- * could have changed. We know the @old_value of the summation of the counter
- * before the aggregation, and we re-sum the counter now. If the expected
- * value falls between the two summations, we're ok.
- *
- * Otherwise, we /might/ have a problem. If the change in the summations is
- * more than we want to tolerate, the filesystem is probably busy and we should
- * just send back INCOMPLETE and see if userspace will try again.
- *
- * If we're repairing then we require an exact match.
- */
- static inline bool
- xchk_fscount_within_range(
- struct xfs_scrub *sc,
- const int64_t old_value,
- struct percpu_counter *counter,
- uint64_t expected)
- {
- int64_t min_value, max_value;
- int64_t curr_value = percpu_counter_sum(counter);
- trace_xchk_fscounters_within_range(sc->mp, expected, curr_value,
- old_value);
- /* Negative values are always wrong. */
- if (curr_value < 0)
- return false;
- /* Exact matches are always ok. */
- if (curr_value == expected)
- return true;
- /* We require exact matches when repair is running. */
- if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
- return false;
- min_value = min(old_value, curr_value);
- max_value = max(old_value, curr_value);
- /* Within the before-and-after range is ok. */
- if (expected >= min_value && expected <= max_value)
- return true;
- /* Everything else is bad. */
- return false;
- }
- /* Check the superblock counters. */
- int
- xchk_fscounters(
- struct xfs_scrub *sc)
- {
- struct xfs_mount *mp = sc->mp;
- struct xchk_fscounters *fsc = sc->buf;
- int64_t icount, ifree, fdblocks, frextents;
- bool try_again = false;
- int error;
- /* Snapshot the percpu counters. */
- icount = percpu_counter_sum(&mp->m_icount);
- ifree = percpu_counter_sum(&mp->m_ifree);
- fdblocks = percpu_counter_sum(&mp->m_fdblocks);
- frextents = percpu_counter_sum(&mp->m_frextents);
- /* No negative values, please! */
- if (icount < 0 || ifree < 0)
- xchk_set_corrupt(sc);
- /*
- * If the filesystem is not frozen, the counter summation calls above
- * can race with xfs_dec_freecounter, which subtracts a requested space
- * reservation from the counter and undoes the subtraction if that made
- * the counter go negative. Therefore, it's possible to see negative
- * values here, and we should only flag that as a corruption if we
- * froze the fs. This is much more likely to happen with frextents
- * since there are no reserved pools.
- */
- if (fdblocks < 0 || frextents < 0) {
- if (!fsc->frozen)
- return -EDEADLOCK;
- xchk_set_corrupt(sc);
- return 0;
- }
- /* See if icount is obviously wrong. */
- if (icount < fsc->icount_min || icount > fsc->icount_max)
- xchk_set_corrupt(sc);
- /* See if fdblocks is obviously wrong. */
- if (fdblocks > mp->m_sb.sb_dblocks)
- xchk_set_corrupt(sc);
- /* See if frextents is obviously wrong. */
- if (frextents > mp->m_sb.sb_rextents)
- xchk_set_corrupt(sc);
- /*
- * If ifree exceeds icount by more than the minimum variance then
- * something's probably wrong with the counters.
- */
- if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE)
- xchk_set_corrupt(sc);
- /* Walk the incore AG headers to calculate the expected counters. */
- error = xchk_fscount_aggregate_agcounts(sc, fsc);
- if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
- return error;
- /* Count the free extents counter for rt volumes. */
- error = xchk_fscount_count_frextents(sc, fsc);
- if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
- return error;
- if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
- return 0;
- /*
- * Compare the in-core counters with whatever we counted. If the fs is
- * frozen, we treat the discrepancy as a corruption because the freeze
- * should have stabilized the counter values. Otherwise, we need
- * userspace to call us back having granted us freeze permission.
- */
- if (!xchk_fscount_within_range(sc, icount, &mp->m_icount,
- fsc->icount)) {
- if (fsc->frozen)
- xchk_set_corrupt(sc);
- else
- try_again = true;
- }
- if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) {
- if (fsc->frozen)
- xchk_set_corrupt(sc);
- else
- try_again = true;
- }
- if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
- fsc->fdblocks)) {
- if (fsc->frozen)
- xchk_set_corrupt(sc);
- else
- try_again = true;
- }
- if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
- fsc->frextents - fsc->frextents_delayed)) {
- if (fsc->frozen)
- xchk_set_corrupt(sc);
- else
- try_again = true;
- }
- if (try_again)
- return -EDEADLOCK;
- return 0;
- }
|