alloc_repair.c 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933
  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
  4. * Author: Darrick J. Wong <djwong@kernel.org>
  5. */
  6. #include "xfs.h"
  7. #include "xfs_fs.h"
  8. #include "xfs_shared.h"
  9. #include "xfs_format.h"
  10. #include "xfs_trans_resv.h"
  11. #include "xfs_mount.h"
  12. #include "xfs_defer.h"
  13. #include "xfs_btree.h"
  14. #include "xfs_btree_staging.h"
  15. #include "xfs_bit.h"
  16. #include "xfs_log_format.h"
  17. #include "xfs_trans.h"
  18. #include "xfs_sb.h"
  19. #include "xfs_alloc.h"
  20. #include "xfs_alloc_btree.h"
  21. #include "xfs_rmap.h"
  22. #include "xfs_rmap_btree.h"
  23. #include "xfs_inode.h"
  24. #include "xfs_refcount.h"
  25. #include "xfs_extent_busy.h"
  26. #include "xfs_health.h"
  27. #include "xfs_bmap.h"
  28. #include "xfs_ialloc.h"
  29. #include "xfs_ag.h"
  30. #include "scrub/xfs_scrub.h"
  31. #include "scrub/scrub.h"
  32. #include "scrub/common.h"
  33. #include "scrub/btree.h"
  34. #include "scrub/trace.h"
  35. #include "scrub/repair.h"
  36. #include "scrub/bitmap.h"
  37. #include "scrub/agb_bitmap.h"
  38. #include "scrub/xfile.h"
  39. #include "scrub/xfarray.h"
  40. #include "scrub/newbt.h"
  41. #include "scrub/reap.h"
  42. /*
  43. * Free Space Btree Repair
  44. * =======================
  45. *
  46. * The reverse mappings are supposed to record all space usage for the entire
  47. * AG. Therefore, we can recreate the free extent records in an AG by looking
  48. * for gaps in the physical extents recorded in the rmapbt. These records are
  49. * staged in @free_records. Identifying the gaps is more difficult on a
  50. * reflink filesystem because rmap records are allowed to overlap.
  51. *
  52. * Because the final step of building a new index is to free the space used by
  53. * the old index, repair needs to find that space. Unfortunately, all
  54. * structures that live in the free space (bnobt, cntbt, rmapbt, agfl) share
  55. * the same rmapbt owner code (OWN_AG), so this is not straightforward.
  56. *
  57. * The scan of the reverse mapping information records the space used by OWN_AG
  58. * in @old_allocbt_blocks, which (at this stage) is somewhat misnamed. While
  59. * walking the rmapbt records, we create a second bitmap @not_allocbt_blocks to
  60. * record all visited rmap btree blocks and all blocks owned by the AGFL.
  61. *
  62. * After that is where the definitions of old_allocbt_blocks shifts. This
  63. * expression identifies possible former bnobt/cntbt blocks:
  64. *
  65. * (OWN_AG blocks) & ~(rmapbt blocks | agfl blocks);
  66. *
  67. * Substituting from above definitions, that becomes:
  68. *
  69. * old_allocbt_blocks & ~not_allocbt_blocks
  70. *
  71. * The OWN_AG bitmap itself isn't needed after this point, so what we really do
  72. * instead is:
  73. *
  74. * old_allocbt_blocks &= ~not_allocbt_blocks;
  75. *
  76. * After this point, @old_allocbt_blocks is a bitmap of alleged former
  77. * bnobt/cntbt blocks. The xagb_bitmap_disunion operation modifies its first
  78. * parameter in place to avoid copying records around.
  79. *
  80. * Next, some of the space described by @free_records are diverted to the newbt
  81. * reservation and used to format new btree blocks. The remaining records are
  82. * written to the new btree indices. We reconstruct both bnobt and cntbt at
  83. * the same time since we've already done all the work.
  84. *
  85. * We use the prefix 'xrep_abt' here because we regenerate both free space
  86. * allocation btrees at the same time.
  87. */
  88. struct xrep_abt {
  89. /* Blocks owned by the rmapbt or the agfl. */
  90. struct xagb_bitmap not_allocbt_blocks;
  91. /* All OWN_AG blocks. */
  92. struct xagb_bitmap old_allocbt_blocks;
  93. /*
  94. * New bnobt information. All btree block reservations are added to
  95. * the reservation list in new_bnobt.
  96. */
  97. struct xrep_newbt new_bnobt;
  98. /* new cntbt information */
  99. struct xrep_newbt new_cntbt;
  100. /* Free space extents. */
  101. struct xfarray *free_records;
  102. struct xfs_scrub *sc;
  103. /* Number of non-null records in @free_records. */
  104. uint64_t nr_real_records;
  105. /* get_records()'s position in the free space record array. */
  106. xfarray_idx_t array_cur;
  107. /*
  108. * Next block we anticipate seeing in the rmap records. If the next
  109. * rmap record is greater than next_agbno, we have found unused space.
  110. */
  111. xfs_agblock_t next_agbno;
  112. /* Number of free blocks in this AG. */
  113. xfs_agblock_t nr_blocks;
  114. /* Longest free extent we found in the AG. */
  115. xfs_agblock_t longest;
  116. };
  117. /* Set up to repair AG free space btrees. */
  118. int
  119. xrep_setup_ag_allocbt(
  120. struct xfs_scrub *sc)
  121. {
  122. unsigned int busy_gen;
  123. /*
  124. * Make sure the busy extent list is clear because we can't put extents
  125. * on there twice.
  126. */
  127. busy_gen = READ_ONCE(sc->sa.pag->pagb_gen);
  128. if (xfs_extent_busy_list_empty(sc->sa.pag))
  129. return 0;
  130. return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0);
  131. }
  132. /* Check for any obvious conflicts in the free extent. */
  133. STATIC int
  134. xrep_abt_check_free_ext(
  135. struct xfs_scrub *sc,
  136. const struct xfs_alloc_rec_incore *rec)
  137. {
  138. enum xbtree_recpacking outcome;
  139. int error;
  140. if (xfs_alloc_check_irec(sc->sa.pag, rec) != NULL)
  141. return -EFSCORRUPTED;
  142. /* Must not be an inode chunk. */
  143. error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
  144. rec->ar_startblock, rec->ar_blockcount, &outcome);
  145. if (error)
  146. return error;
  147. if (outcome != XBTREE_RECPACKING_EMPTY)
  148. return -EFSCORRUPTED;
  149. /* Must not be shared or CoW staging. */
  150. if (sc->sa.refc_cur) {
  151. error = xfs_refcount_has_records(sc->sa.refc_cur,
  152. XFS_REFC_DOMAIN_SHARED, rec->ar_startblock,
  153. rec->ar_blockcount, &outcome);
  154. if (error)
  155. return error;
  156. if (outcome != XBTREE_RECPACKING_EMPTY)
  157. return -EFSCORRUPTED;
  158. error = xfs_refcount_has_records(sc->sa.refc_cur,
  159. XFS_REFC_DOMAIN_COW, rec->ar_startblock,
  160. rec->ar_blockcount, &outcome);
  161. if (error)
  162. return error;
  163. if (outcome != XBTREE_RECPACKING_EMPTY)
  164. return -EFSCORRUPTED;
  165. }
  166. return 0;
  167. }
  168. /*
  169. * Stash a free space record for all the space since the last bno we found
  170. * all the way up to @end.
  171. */
  172. static int
  173. xrep_abt_stash(
  174. struct xrep_abt *ra,
  175. xfs_agblock_t end)
  176. {
  177. struct xfs_alloc_rec_incore arec = {
  178. .ar_startblock = ra->next_agbno,
  179. .ar_blockcount = end - ra->next_agbno,
  180. };
  181. struct xfs_scrub *sc = ra->sc;
  182. int error = 0;
  183. if (xchk_should_terminate(sc, &error))
  184. return error;
  185. error = xrep_abt_check_free_ext(ra->sc, &arec);
  186. if (error)
  187. return error;
  188. trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec);
  189. error = xfarray_append(ra->free_records, &arec);
  190. if (error)
  191. return error;
  192. ra->nr_blocks += arec.ar_blockcount;
  193. return 0;
  194. }
  195. /* Record extents that aren't in use from gaps in the rmap records. */
  196. STATIC int
  197. xrep_abt_walk_rmap(
  198. struct xfs_btree_cur *cur,
  199. const struct xfs_rmap_irec *rec,
  200. void *priv)
  201. {
  202. struct xrep_abt *ra = priv;
  203. int error;
  204. /* Record all the OWN_AG blocks... */
  205. if (rec->rm_owner == XFS_RMAP_OWN_AG) {
  206. error = xagb_bitmap_set(&ra->old_allocbt_blocks,
  207. rec->rm_startblock, rec->rm_blockcount);
  208. if (error)
  209. return error;
  210. }
  211. /* ...and all the rmapbt blocks... */
  212. error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur);
  213. if (error)
  214. return error;
  215. /* ...and all the free space. */
  216. if (rec->rm_startblock > ra->next_agbno) {
  217. error = xrep_abt_stash(ra, rec->rm_startblock);
  218. if (error)
  219. return error;
  220. }
  221. /*
  222. * rmap records can overlap on reflink filesystems, so project
  223. * next_agbno as far out into the AG space as we currently know about.
  224. */
  225. ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno,
  226. rec->rm_startblock + rec->rm_blockcount);
  227. return 0;
  228. }
  229. /* Collect an AGFL block for the not-to-release list. */
  230. static int
  231. xrep_abt_walk_agfl(
  232. struct xfs_mount *mp,
  233. xfs_agblock_t agbno,
  234. void *priv)
  235. {
  236. struct xrep_abt *ra = priv;
  237. return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1);
  238. }
  239. /*
  240. * Compare two free space extents by block number. We want to sort in order of
  241. * increasing block number.
  242. */
  243. static int
  244. xrep_bnobt_extent_cmp(
  245. const void *a,
  246. const void *b)
  247. {
  248. const struct xfs_alloc_rec_incore *ap = a;
  249. const struct xfs_alloc_rec_incore *bp = b;
  250. if (ap->ar_startblock > bp->ar_startblock)
  251. return 1;
  252. else if (ap->ar_startblock < bp->ar_startblock)
  253. return -1;
  254. return 0;
  255. }
  256. /*
  257. * Re-sort the free extents by block number so that we can put the records into
  258. * the bnobt in the correct order. Make sure the records do not overlap in
  259. * physical space.
  260. */
  261. STATIC int
  262. xrep_bnobt_sort_records(
  263. struct xrep_abt *ra)
  264. {
  265. struct xfs_alloc_rec_incore arec;
  266. xfarray_idx_t cur = XFARRAY_CURSOR_INIT;
  267. xfs_agblock_t next_agbno = 0;
  268. int error;
  269. error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0);
  270. if (error)
  271. return error;
  272. while ((error = xfarray_iter(ra->free_records, &cur, &arec)) == 1) {
  273. if (arec.ar_startblock < next_agbno)
  274. return -EFSCORRUPTED;
  275. next_agbno = arec.ar_startblock + arec.ar_blockcount;
  276. }
  277. return error;
  278. }
  279. /*
  280. * Compare two free space extents by length and then block number. We want
  281. * to sort first in order of increasing length and then in order of increasing
  282. * block number.
  283. */
  284. static int
  285. xrep_cntbt_extent_cmp(
  286. const void *a,
  287. const void *b)
  288. {
  289. const struct xfs_alloc_rec_incore *ap = a;
  290. const struct xfs_alloc_rec_incore *bp = b;
  291. if (ap->ar_blockcount > bp->ar_blockcount)
  292. return 1;
  293. else if (ap->ar_blockcount < bp->ar_blockcount)
  294. return -1;
  295. return xrep_bnobt_extent_cmp(a, b);
  296. }
  297. /*
  298. * Sort the free extents by length so so that we can put the records into the
  299. * cntbt in the correct order. Don't let userspace kill us if we're resorting
  300. * after allocating btree blocks.
  301. */
  302. STATIC int
  303. xrep_cntbt_sort_records(
  304. struct xrep_abt *ra,
  305. bool is_resort)
  306. {
  307. return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp,
  308. is_resort ? 0 : XFARRAY_SORT_KILLABLE);
  309. }
  310. /*
  311. * Iterate all reverse mappings to find (1) the gaps between rmap records (all
  312. * unowned space), (2) the OWN_AG extents (which encompass the free space
  313. * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL
  314. * blocks. The free space is (1) + (2) - (3) - (4).
  315. */
  316. STATIC int
  317. xrep_abt_find_freespace(
  318. struct xrep_abt *ra)
  319. {
  320. struct xfs_scrub *sc = ra->sc;
  321. struct xfs_mount *mp = sc->mp;
  322. struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
  323. struct xfs_buf *agfl_bp;
  324. xfs_agblock_t agend;
  325. int error;
  326. xagb_bitmap_init(&ra->not_allocbt_blocks);
  327. xrep_ag_btcur_init(sc, &sc->sa);
  328. /*
  329. * Iterate all the reverse mappings to find gaps in the physical
  330. * mappings, all the OWN_AG blocks, and all the rmapbt extents.
  331. */
  332. error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra);
  333. if (error)
  334. goto err;
  335. /* Insert a record for space between the last rmap and EOAG. */
  336. agend = be32_to_cpu(agf->agf_length);
  337. if (ra->next_agbno < agend) {
  338. error = xrep_abt_stash(ra, agend);
  339. if (error)
  340. goto err;
  341. }
  342. /* Collect all the AGFL blocks. */
  343. error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
  344. if (error)
  345. goto err;
  346. error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra);
  347. if (error)
  348. goto err_agfl;
  349. /* Compute the old bnobt/cntbt blocks. */
  350. error = xagb_bitmap_disunion(&ra->old_allocbt_blocks,
  351. &ra->not_allocbt_blocks);
  352. if (error)
  353. goto err_agfl;
  354. ra->nr_real_records = xfarray_length(ra->free_records);
  355. err_agfl:
  356. xfs_trans_brelse(sc->tp, agfl_bp);
  357. err:
  358. xchk_ag_btcur_free(&sc->sa);
  359. xagb_bitmap_destroy(&ra->not_allocbt_blocks);
  360. return error;
  361. }
  362. /*
  363. * We're going to use the observed free space records to reserve blocks for the
  364. * new free space btrees, so we play an iterative game where we try to converge
  365. * on the number of blocks we need:
  366. *
  367. * 1. Estimate how many blocks we'll need to store the records.
  368. * 2. If the first free record has more blocks than we need, we're done.
  369. * We will have to re-sort the records prior to building the cntbt.
  370. * 3. If that record has exactly the number of blocks we need, null out the
  371. * record. We're done.
  372. * 4. Otherwise, we still need more blocks. Null out the record, subtract its
  373. * length from the number of blocks we need, and go back to step 1.
  374. *
  375. * Fortunately, we don't have to do any transaction work to play this game, so
  376. * we don't have to tear down the staging cursors.
  377. */
  378. STATIC int
  379. xrep_abt_reserve_space(
  380. struct xrep_abt *ra,
  381. struct xfs_btree_cur *bno_cur,
  382. struct xfs_btree_cur *cnt_cur,
  383. bool *needs_resort)
  384. {
  385. struct xfs_scrub *sc = ra->sc;
  386. xfarray_idx_t record_nr;
  387. unsigned int allocated = 0;
  388. int error = 0;
  389. record_nr = xfarray_length(ra->free_records) - 1;
  390. do {
  391. struct xfs_alloc_rec_incore arec;
  392. uint64_t required;
  393. unsigned int desired;
  394. unsigned int len;
  395. /* Compute how many blocks we'll need. */
  396. error = xfs_btree_bload_compute_geometry(cnt_cur,
  397. &ra->new_cntbt.bload, ra->nr_real_records);
  398. if (error)
  399. break;
  400. error = xfs_btree_bload_compute_geometry(bno_cur,
  401. &ra->new_bnobt.bload, ra->nr_real_records);
  402. if (error)
  403. break;
  404. /* How many btree blocks do we need to store all records? */
  405. required = ra->new_bnobt.bload.nr_blocks +
  406. ra->new_cntbt.bload.nr_blocks;
  407. ASSERT(required < INT_MAX);
  408. /* If we've reserved enough blocks, we're done. */
  409. if (allocated >= required)
  410. break;
  411. desired = required - allocated;
  412. /* We need space but there's none left; bye! */
  413. if (ra->nr_real_records == 0) {
  414. error = -ENOSPC;
  415. break;
  416. }
  417. /* Grab the first record from the list. */
  418. error = xfarray_load(ra->free_records, record_nr, &arec);
  419. if (error)
  420. break;
  421. ASSERT(arec.ar_blockcount <= UINT_MAX);
  422. len = min_t(unsigned int, arec.ar_blockcount, desired);
  423. trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno,
  424. arec.ar_startblock, len, XFS_RMAP_OWN_AG);
  425. error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag,
  426. arec.ar_startblock, len);
  427. if (error)
  428. break;
  429. allocated += len;
  430. ra->nr_blocks -= len;
  431. if (arec.ar_blockcount > desired) {
  432. /*
  433. * Record has more space than we need. The number of
  434. * free records doesn't change, so shrink the free
  435. * record, inform the caller that the records are no
  436. * longer sorted by length, and exit.
  437. */
  438. arec.ar_startblock += desired;
  439. arec.ar_blockcount -= desired;
  440. error = xfarray_store(ra->free_records, record_nr,
  441. &arec);
  442. if (error)
  443. break;
  444. *needs_resort = true;
  445. return 0;
  446. }
  447. /*
  448. * We're going to use up the entire record, so unset it and
  449. * move on to the next one. This changes the number of free
  450. * records (but doesn't break the sorting order), so we must
  451. * go around the loop once more to re-run _bload_init.
  452. */
  453. error = xfarray_unset(ra->free_records, record_nr);
  454. if (error)
  455. break;
  456. ra->nr_real_records--;
  457. record_nr--;
  458. } while (1);
  459. return error;
  460. }
  461. STATIC int
  462. xrep_abt_dispose_one(
  463. struct xrep_abt *ra,
  464. struct xrep_newbt_resv *resv)
  465. {
  466. struct xfs_scrub *sc = ra->sc;
  467. struct xfs_perag *pag = sc->sa.pag;
  468. xfs_agblock_t free_agbno = resv->agbno + resv->used;
  469. xfs_extlen_t free_aglen = resv->len - resv->used;
  470. int error;
  471. ASSERT(pag == resv->pag);
  472. /* Add a deferred rmap for each extent we used. */
  473. if (resv->used > 0)
  474. xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno,
  475. resv->used, XFS_RMAP_OWN_AG);
  476. /*
  477. * For each reserved btree block we didn't use, add it to the free
  478. * space btree. We didn't touch fdblocks when we reserved them, so
  479. * we don't touch it now.
  480. */
  481. if (free_aglen == 0)
  482. return 0;
  483. trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
  484. free_aglen, ra->new_bnobt.oinfo.oi_owner);
  485. error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen,
  486. &ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true);
  487. if (error)
  488. return error;
  489. return xrep_defer_finish(sc);
  490. }
  491. /*
  492. * Deal with all the space we reserved. Blocks that were allocated for the
  493. * free space btrees need to have a (deferred) rmap added for the OWN_AG
  494. * allocation, and blocks that didn't get used can be freed via the usual
  495. * (deferred) means.
  496. */
  497. STATIC void
  498. xrep_abt_dispose_reservations(
  499. struct xrep_abt *ra,
  500. int error)
  501. {
  502. struct xrep_newbt_resv *resv, *n;
  503. if (error)
  504. goto junkit;
  505. list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
  506. error = xrep_abt_dispose_one(ra, resv);
  507. if (error)
  508. goto junkit;
  509. }
  510. junkit:
  511. list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
  512. xfs_perag_put(resv->pag);
  513. list_del(&resv->list);
  514. kfree(resv);
  515. }
  516. xrep_newbt_cancel(&ra->new_bnobt);
  517. xrep_newbt_cancel(&ra->new_cntbt);
  518. }
  519. /* Retrieve free space data for bulk load. */
  520. STATIC int
  521. xrep_abt_get_records(
  522. struct xfs_btree_cur *cur,
  523. unsigned int idx,
  524. struct xfs_btree_block *block,
  525. unsigned int nr_wanted,
  526. void *priv)
  527. {
  528. struct xfs_alloc_rec_incore *arec = &cur->bc_rec.a;
  529. struct xrep_abt *ra = priv;
  530. union xfs_btree_rec *block_rec;
  531. unsigned int loaded;
  532. int error;
  533. for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
  534. error = xfarray_load_next(ra->free_records, &ra->array_cur,
  535. arec);
  536. if (error)
  537. return error;
  538. ra->longest = max(ra->longest, arec->ar_blockcount);
  539. block_rec = xfs_btree_rec_addr(cur, idx, block);
  540. cur->bc_ops->init_rec_from_cur(cur, block_rec);
  541. }
  542. return loaded;
  543. }
  544. /* Feed one of the new btree blocks to the bulk loader. */
  545. STATIC int
  546. xrep_abt_claim_block(
  547. struct xfs_btree_cur *cur,
  548. union xfs_btree_ptr *ptr,
  549. void *priv)
  550. {
  551. struct xrep_abt *ra = priv;
  552. return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr);
  553. }
  554. /*
  555. * Reset the AGF counters to reflect the free space btrees that we just
  556. * rebuilt, then reinitialize the per-AG data.
  557. */
  558. STATIC int
  559. xrep_abt_reset_counters(
  560. struct xrep_abt *ra)
  561. {
  562. struct xfs_scrub *sc = ra->sc;
  563. struct xfs_perag *pag = sc->sa.pag;
  564. struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
  565. unsigned int freesp_btreeblks = 0;
  566. /*
  567. * Compute the contribution to agf_btreeblks for the new free space
  568. * btrees. This is the computed btree size minus anything we didn't
  569. * use.
  570. */
  571. freesp_btreeblks += ra->new_bnobt.bload.nr_blocks - 1;
  572. freesp_btreeblks += ra->new_cntbt.bload.nr_blocks - 1;
  573. freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_bnobt);
  574. freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_cntbt);
  575. /*
  576. * The AGF header contains extra information related to the free space
  577. * btrees, so we must update those fields here.
  578. */
  579. agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks +
  580. (be32_to_cpu(agf->agf_rmap_blocks) - 1));
  581. agf->agf_freeblks = cpu_to_be32(ra->nr_blocks);
  582. agf->agf_longest = cpu_to_be32(ra->longest);
  583. xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS |
  584. XFS_AGF_LONGEST |
  585. XFS_AGF_FREEBLKS);
  586. /*
  587. * After we commit the new btree to disk, it is possible that the
  588. * process to reap the old btree blocks will race with the AIL trying
  589. * to checkpoint the old btree blocks into the filesystem. If the new
  590. * tree is shorter than the old one, the allocbt write verifier will
  591. * fail and the AIL will shut down the filesystem.
  592. *
  593. * To avoid this, save the old incore btree height values as the alt
  594. * height values before re-initializing the perag info from the updated
  595. * AGF to capture all the new values.
  596. */
  597. pag->pagf_repair_bno_level = pag->pagf_bno_level;
  598. pag->pagf_repair_cnt_level = pag->pagf_cnt_level;
  599. /* Reinitialize with the values we just logged. */
  600. return xrep_reinit_pagf(sc);
  601. }
  602. /*
  603. * Use the collected free space information to stage new free space btrees.
  604. * If this is successful we'll return with the new btree root
  605. * information logged to the repair transaction but not yet committed.
  606. */
  607. STATIC int
  608. xrep_abt_build_new_trees(
  609. struct xrep_abt *ra)
  610. {
  611. struct xfs_scrub *sc = ra->sc;
  612. struct xfs_btree_cur *bno_cur;
  613. struct xfs_btree_cur *cnt_cur;
  614. struct xfs_perag *pag = sc->sa.pag;
  615. bool needs_resort = false;
  616. int error;
  617. /*
  618. * Sort the free extents by length so that we can set up the free space
  619. * btrees in as few extents as possible. This reduces the amount of
  620. * deferred rmap / free work we have to do at the end.
  621. */
  622. error = xrep_cntbt_sort_records(ra, false);
  623. if (error)
  624. return error;
  625. /*
  626. * Prepare to construct the new btree by reserving disk space for the
  627. * new btree and setting up all the accounting information we'll need
  628. * to root the new btree while it's under construction and before we
  629. * attach it to the AG header.
  630. */
  631. xrep_newbt_init_bare(&ra->new_bnobt, sc);
  632. xrep_newbt_init_bare(&ra->new_cntbt, sc);
  633. ra->new_bnobt.bload.get_records = xrep_abt_get_records;
  634. ra->new_cntbt.bload.get_records = xrep_abt_get_records;
  635. ra->new_bnobt.bload.claim_block = xrep_abt_claim_block;
  636. ra->new_cntbt.bload.claim_block = xrep_abt_claim_block;
  637. /* Allocate cursors for the staged btrees. */
  638. bno_cur = xfs_bnobt_init_cursor(sc->mp, NULL, NULL, pag);
  639. xfs_btree_stage_afakeroot(bno_cur, &ra->new_bnobt.afake);
  640. cnt_cur = xfs_cntbt_init_cursor(sc->mp, NULL, NULL, pag);
  641. xfs_btree_stage_afakeroot(cnt_cur, &ra->new_cntbt.afake);
  642. /* Last chance to abort before we start committing fixes. */
  643. if (xchk_should_terminate(sc, &error))
  644. goto err_cur;
  645. /* Reserve the space we'll need for the new btrees. */
  646. error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort);
  647. if (error)
  648. goto err_cur;
  649. /*
  650. * If we need to re-sort the free extents by length, do so so that we
  651. * can put the records into the cntbt in the correct order.
  652. */
  653. if (needs_resort) {
  654. error = xrep_cntbt_sort_records(ra, needs_resort);
  655. if (error)
  656. goto err_cur;
  657. }
  658. /*
  659. * Due to btree slack factors, it's possible for a new btree to be one
  660. * level taller than the old btree. Update the alternate incore btree
  661. * height so that we don't trip the verifiers when writing the new
  662. * btree blocks to disk.
  663. */
  664. pag->pagf_repair_bno_level = ra->new_bnobt.bload.btree_height;
  665. pag->pagf_repair_cnt_level = ra->new_cntbt.bload.btree_height;
  666. /* Load the free space by length tree. */
  667. ra->array_cur = XFARRAY_CURSOR_INIT;
  668. ra->longest = 0;
  669. error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra);
  670. if (error)
  671. goto err_levels;
  672. error = xrep_bnobt_sort_records(ra);
  673. if (error)
  674. goto err_levels;
  675. /* Load the free space by block number tree. */
  676. ra->array_cur = XFARRAY_CURSOR_INIT;
  677. error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra);
  678. if (error)
  679. goto err_levels;
  680. /*
  681. * Install the new btrees in the AG header. After this point the old
  682. * btrees are no longer accessible and the new trees are live.
  683. */
  684. xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp);
  685. xfs_btree_del_cursor(bno_cur, 0);
  686. xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp);
  687. xfs_btree_del_cursor(cnt_cur, 0);
  688. /* Reset the AGF counters now that we've changed the btree shape. */
  689. error = xrep_abt_reset_counters(ra);
  690. if (error)
  691. goto err_newbt;
  692. /* Dispose of any unused blocks and the accounting information. */
  693. xrep_abt_dispose_reservations(ra, error);
  694. return xrep_roll_ag_trans(sc);
  695. err_levels:
  696. pag->pagf_repair_bno_level = 0;
  697. pag->pagf_repair_cnt_level = 0;
  698. err_cur:
  699. xfs_btree_del_cursor(cnt_cur, error);
  700. xfs_btree_del_cursor(bno_cur, error);
  701. err_newbt:
  702. xrep_abt_dispose_reservations(ra, error);
  703. return error;
  704. }
  705. /*
  706. * Now that we've logged the roots of the new btrees, invalidate all of the
  707. * old blocks and free them.
  708. */
  709. STATIC int
  710. xrep_abt_remove_old_trees(
  711. struct xrep_abt *ra)
  712. {
  713. struct xfs_perag *pag = ra->sc->sa.pag;
  714. int error;
  715. /* Free the old btree blocks if they're not in use. */
  716. error = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks,
  717. &XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE);
  718. if (error)
  719. return error;
  720. /*
  721. * Now that we've zapped all the old allocbt blocks we can turn off
  722. * the alternate height mechanism.
  723. */
  724. pag->pagf_repair_bno_level = 0;
  725. pag->pagf_repair_cnt_level = 0;
  726. return 0;
  727. }
  728. /* Repair the freespace btrees for some AG. */
  729. int
  730. xrep_allocbt(
  731. struct xfs_scrub *sc)
  732. {
  733. struct xrep_abt *ra;
  734. struct xfs_mount *mp = sc->mp;
  735. char *descr;
  736. int error;
  737. /* We require the rmapbt to rebuild anything. */
  738. if (!xfs_has_rmapbt(mp))
  739. return -EOPNOTSUPP;
  740. ra = kzalloc(sizeof(struct xrep_abt), XCHK_GFP_FLAGS);
  741. if (!ra)
  742. return -ENOMEM;
  743. ra->sc = sc;
  744. /* We rebuild both data structures. */
  745. sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT;
  746. /*
  747. * Make sure the busy extent list is clear because we can't put extents
  748. * on there twice. In theory we cleared this before we started, but
  749. * let's not risk the filesystem.
  750. */
  751. if (!xfs_extent_busy_list_empty(sc->sa.pag)) {
  752. error = -EDEADLOCK;
  753. goto out_ra;
  754. }
  755. /* Set up enough storage to handle maximally fragmented free space. */
  756. descr = xchk_xfile_ag_descr(sc, "free space records");
  757. error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2,
  758. sizeof(struct xfs_alloc_rec_incore),
  759. &ra->free_records);
  760. kfree(descr);
  761. if (error)
  762. goto out_ra;
  763. /* Collect the free space data and find the old btree blocks. */
  764. xagb_bitmap_init(&ra->old_allocbt_blocks);
  765. error = xrep_abt_find_freespace(ra);
  766. if (error)
  767. goto out_bitmap;
  768. /* Rebuild the free space information. */
  769. error = xrep_abt_build_new_trees(ra);
  770. if (error)
  771. goto out_bitmap;
  772. /* Kill the old trees. */
  773. error = xrep_abt_remove_old_trees(ra);
  774. if (error)
  775. goto out_bitmap;
  776. out_bitmap:
  777. xagb_bitmap_destroy(&ra->old_allocbt_blocks);
  778. xfarray_destroy(ra->free_records);
  779. out_ra:
  780. kfree(ra);
  781. return error;
  782. }
  783. /* Make sure both btrees are ok after we've rebuilt them. */
  784. int
  785. xrep_revalidate_allocbt(
  786. struct xfs_scrub *sc)
  787. {
  788. __u32 old_type = sc->sm->sm_type;
  789. int error;
  790. /*
  791. * We must update sm_type temporarily so that the tree-to-tree cross
  792. * reference checks will work in the correct direction, and also so
  793. * that tracing will report correctly if there are more errors.
  794. */
  795. sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT;
  796. error = xchk_allocbt(sc);
  797. if (error)
  798. goto out;
  799. sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT;
  800. error = xchk_allocbt(sc);
  801. out:
  802. sc->sm->sm_type = old_type;
  803. return error;
  804. }