// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/newbt.h"
  27. /*
  28. * Estimate proper slack values for a btree that's being reloaded.
  29. *
  30. * Under most circumstances, we'll take whatever default loading value the
  31. * btree bulk loading code calculates for us. However, there are some
  32. * exceptions to this rule:
  33. *
  34. * (0) If someone turned one of the debug knobs.
  35. * (1) If this is a per-AG btree and the AG has less than 10% space free.
  36. * (2) If this is an inode btree and the FS has less than 10% space free.
  37. * In either case, format the new btree blocks almost completely full to
  38. * minimize space usage.
  39. */
  40. static void
  41. xrep_newbt_estimate_slack(
  42. struct xrep_newbt *xnr)
  43. {
  44. struct xfs_scrub *sc = xnr->sc;
  45. struct xfs_btree_bload *bload = &xnr->bload;
  46. uint64_t free;
  47. uint64_t sz;
  48. /*
  49. * The xfs_globals values are set to -1 (i.e. take the bload defaults)
  50. * unless someone has set them otherwise, so we just pull the values
  51. * here.
  52. */
  53. bload->leaf_slack = xfs_globals.bload_leaf_slack;
  54. bload->node_slack = xfs_globals.bload_node_slack;
  55. if (sc->ops->type == ST_PERAG) {
  56. free = sc->sa.pag->pagf_freeblks;
  57. sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno);
  58. } else {
  59. free = percpu_counter_sum(&sc->mp->m_fdblocks);
  60. sz = sc->mp->m_sb.sb_dblocks;
  61. }
  62. /* No further changes if there's more than 10% free space left. */
  63. if (free >= div_u64(sz, 10))
  64. return;
  65. /*
  66. * We're low on space; load the btrees as tightly as possible. Leave
  67. * a couple of open slots in each btree block so that we don't end up
  68. * splitting the btrees like crazy after a mount.
  69. */
  70. if (bload->leaf_slack < 0)
  71. bload->leaf_slack = 2;
  72. if (bload->node_slack < 0)
  73. bload->node_slack = 2;
  74. }
  75. /* Initialize accounting resources for staging a new AG btree. */
  76. void
  77. xrep_newbt_init_ag(
  78. struct xrep_newbt *xnr,
  79. struct xfs_scrub *sc,
  80. const struct xfs_owner_info *oinfo,
  81. xfs_fsblock_t alloc_hint,
  82. enum xfs_ag_resv_type resv)
  83. {
  84. memset(xnr, 0, sizeof(struct xrep_newbt));
  85. xnr->sc = sc;
  86. xnr->oinfo = *oinfo; /* structure copy */
  87. xnr->alloc_hint = alloc_hint;
  88. xnr->resv = resv;
  89. INIT_LIST_HEAD(&xnr->resv_list);
  90. xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
  91. xrep_newbt_estimate_slack(xnr);
  92. }
  93. /* Initialize accounting resources for staging a new inode fork btree. */
  94. int
  95. xrep_newbt_init_inode(
  96. struct xrep_newbt *xnr,
  97. struct xfs_scrub *sc,
  98. int whichfork,
  99. const struct xfs_owner_info *oinfo)
  100. {
  101. struct xfs_ifork *ifp;
  102. ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
  103. if (!ifp)
  104. return -ENOMEM;
  105. xrep_newbt_init_ag(xnr, sc, oinfo,
  106. XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
  107. XFS_AG_RESV_NONE);
  108. xnr->ifake.if_fork = ifp;
  109. xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
  110. return 0;
  111. }
  112. /*
  113. * Initialize accounting resources for staging a new btree. Callers are
  114. * expected to add their own reservations (and clean them up) manually.
  115. */
  116. void
  117. xrep_newbt_init_bare(
  118. struct xrep_newbt *xnr,
  119. struct xfs_scrub *sc)
  120. {
  121. xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
  122. XFS_AG_RESV_NONE);
  123. }
  124. /*
  125. * Designate specific blocks to be used to build our new btree. @pag must be
  126. * a passive reference.
  127. */
  128. STATIC int
  129. xrep_newbt_add_blocks(
  130. struct xrep_newbt *xnr,
  131. struct xfs_perag *pag,
  132. const struct xfs_alloc_arg *args)
  133. {
  134. struct xfs_mount *mp = xnr->sc->mp;
  135. struct xrep_newbt_resv *resv;
  136. int error;
  137. resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
  138. if (!resv)
  139. return -ENOMEM;
  140. INIT_LIST_HEAD(&resv->list);
  141. resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
  142. resv->len = args->len;
  143. resv->used = 0;
  144. resv->pag = xfs_perag_hold(pag);
  145. if (args->tp) {
  146. ASSERT(xnr->oinfo.oi_offset == 0);
  147. error = xfs_alloc_schedule_autoreap(args,
  148. XFS_FREE_EXTENT_SKIP_DISCARD, &resv->autoreap);
  149. if (error)
  150. goto out_pag;
  151. }
  152. list_add_tail(&resv->list, &xnr->resv_list);
  153. return 0;
  154. out_pag:
  155. xfs_perag_put(resv->pag);
  156. kfree(resv);
  157. return error;
  158. }
  159. /*
  160. * Add an extent to the new btree reservation pool. Callers are required to
  161. * reap this reservation manually if the repair is cancelled. @pag must be a
  162. * passive reference.
  163. */
  164. int
  165. xrep_newbt_add_extent(
  166. struct xrep_newbt *xnr,
  167. struct xfs_perag *pag,
  168. xfs_agblock_t agbno,
  169. xfs_extlen_t len)
  170. {
  171. struct xfs_mount *mp = xnr->sc->mp;
  172. struct xfs_alloc_arg args = {
  173. .tp = NULL, /* no autoreap */
  174. .oinfo = xnr->oinfo,
  175. .fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno),
  176. .len = len,
  177. .resv = xnr->resv,
  178. };
  179. return xrep_newbt_add_blocks(xnr, pag, &args);
  180. }
  181. /* Don't let our allocation hint take us beyond this AG */
  182. static inline void
  183. xrep_newbt_validate_ag_alloc_hint(
  184. struct xrep_newbt *xnr)
  185. {
  186. struct xfs_scrub *sc = xnr->sc;
  187. xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);
  188. if (agno == sc->sa.pag->pag_agno &&
  189. xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
  190. return;
  191. xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
  192. XFS_AGFL_BLOCK(sc->mp) + 1);
  193. }
/*
 * Allocate disk space for a new per-AG btree.
 *
 * Allocates @nr_blocks fsblocks (possibly in several extents) from the AG
 * attached to the scrub context and records each extent as a reservation.
 * Returns 0 on success, -ENOSPC if the allocator runs dry, -EFSCORRUPTED if
 * an extent lands outside this AG, or another negative errno.
 */
STATIC int
xrep_newbt_alloc_ag_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	ASSERT(sc->sa.pag != NULL);

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp	= sc->tp,
			.mp	= mp,
			.oinfo	= xnr->oinfo,
			.minlen	= 1,
			.maxlen	= nr_blocks,
			.prod	= 1,
			.resv	= xnr->resv,
		};
		xfs_agnumber_t		agno;

		/* Keep the allocation hint inside the AG being repaired. */
		xrep_newbt_validate_ag_alloc_hint(xnr);

		/* A caller-supplied allocator overrides the near-bno default. */
		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_near_bno(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		trace_xrep_newbt_alloc_ag_blocks(mp, agno,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		/* A per-AG btree may only use space from its own AG. */
		if (agno != sc->sa.pag->pag_agno) {
			ASSERT(agno == sc->sa.pag->pag_agno);
			return -EFSCORRUPTED;
		}

		error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
		if (error)
			return error;

		nr_blocks -= args.len;
		/* Next allocation starts right after this extent. */
		xnr->alloc_hint = args.fsbno + args.len;

		/* Finish deferred work after each allocation. */
		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}
  244. /* Don't let our allocation hint take us beyond EOFS */
  245. static inline void
  246. xrep_newbt_validate_file_alloc_hint(
  247. struct xrep_newbt *xnr)
  248. {
  249. struct xfs_scrub *sc = xnr->sc;
  250. if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
  251. return;
  252. xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
  253. }
/*
 * Allocate disk space for our new file-based btree.
 *
 * Allocates @nr_blocks fsblocks (possibly in several extents, possibly from
 * several AGs) and records each extent as a reservation.  Returns 0 on
 * success, -ENOSPC if the allocator runs dry, or another negative errno.
 */
STATIC int
xrep_newbt_alloc_file_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp	= sc->tp,
			.mp	= mp,
			.oinfo	= xnr->oinfo,
			.minlen	= 1,
			.maxlen	= nr_blocks,
			.prod	= 1,
			.resv	= xnr->resv,
		};
		struct xfs_perag	*pag;
		xfs_agnumber_t		agno;

		/* Keep the allocation hint inside the filesystem. */
		xrep_newbt_validate_file_alloc_hint(xnr);

		/* A caller-supplied allocator overrides the start-ag default. */
		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_start_ag(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		trace_xrep_newbt_alloc_file_blocks(mp, agno,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		/* Grab the AG that the allocator picked for this extent. */
		pag = xfs_perag_get(mp, agno);
		if (!pag) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		error = xrep_newbt_add_blocks(xnr, pag, &args);
		xfs_perag_put(pag);
		if (error)
			return error;

		nr_blocks -= args.len;
		/* Next allocation starts right after this extent. */
		xnr->alloc_hint = args.fsbno + args.len;

		/* Finish deferred work after each allocation. */
		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}
  306. /* Allocate disk space for our new btree. */
  307. int
  308. xrep_newbt_alloc_blocks(
  309. struct xrep_newbt *xnr,
  310. uint64_t nr_blocks)
  311. {
  312. if (xnr->sc->ip)
  313. return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
  314. return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
  315. }
/*
 * Free the unused part of a space extent that was reserved for a new ondisk
 * structure.  Returns the number of EFIs logged or a negative errno.
 */
STATIC int
xrep_newbt_free_extent(
	struct xrep_newbt	*xnr,
	struct xrep_newbt_resv	*resv,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agblock_t		free_agbno = resv->agbno;
	xfs_extlen_t		free_aglen = resv->len;
	xfs_fsblock_t		fsbno;
	int			error;

	if (!btree_committed || resv->used == 0) {
		/*
		 * If we're not committing a new btree or we didn't use the
		 * space reservation, let the existing EFI free the entire
		 * space extent.
		 */
		trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno,
				free_agbno, free_aglen, xnr->oinfo.oi_owner);
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		return 1;
	}

	/*
	 * We used space and committed the btree.  Cancel the autoreap, remove
	 * the written blocks from the reservation, and possibly log a new EFI
	 * to free any unused reservation space.
	 */
	xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
	free_agbno += resv->used;
	free_aglen -= resv->used;

	/* The new btree claimed the whole reservation; nothing left to free. */
	if (free_aglen == 0)
		return 0;

	trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
			free_aglen, xnr->oinfo.oi_owner);

	/* These reservation types should never leave a tail to free here. */
	ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
	ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);

	/*
	 * Use EFIs to free the reservations.  This reduces the chance
	 * that we leak blocks if the system goes down.
	 */
	fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
	error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo,
			xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
	if (error)
		return error;

	return 1;
}
/*
 * Free all the accounting info and disk space we reserved for a new btree.
 *
 * If @btree_committed is true, blocks claimed by the new btree are kept and
 * only the unused tail of each reservation is freed; otherwise every
 * reserved block is freed.  Returns 0 or a negative errno.
 */
STATIC int
xrep_newbt_free(
	struct xrep_newbt	*xnr,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xrep_newbt_resv	*resv, *n;
	unsigned int		freed = 0;
	int			error = 0;

	/*
	 * If the filesystem already went down, we can't free the blocks.  Skip
	 * ahead to freeing the incore metadata because we can't fix anything.
	 */
	if (xfs_is_shutdown(sc->mp))
		goto junkit;

	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		int ret;

		ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
		/* The reservation is consumed regardless of the outcome. */
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
		if (ret < 0) {
			error = ret;
			goto junkit;
		}

		/*
		 * Cap the number of EFIs logged per transaction; finish the
		 * deferred work once we hit the limit.
		 */
		freed += ret;
		if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
			error = xrep_defer_finish(sc);
			if (error)
				goto junkit;
			freed = 0;
		}
	}

	/* Finish any EFIs logged since the last xrep_defer_finish. */
	if (freed)
		error = xrep_defer_finish(sc);

junkit:
	/*
	 * If we still have reservations attached to @newbt, cleanup must have
	 * failed and the filesystem is about to go down.  Clean up the incore
	 * reservations and try to commit to freeing the space we used.
	 */
	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
	}

	/* Inode fork repairs also allocated a staging fork; release it. */
	if (sc->ip) {
		kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
		xnr->ifake.if_fork = NULL;
	}

	return error;
}
  421. /*
  422. * Free all the accounting info and unused disk space allocations after
  423. * committing a new btree.
  424. */
  425. int
  426. xrep_newbt_commit(
  427. struct xrep_newbt *xnr)
  428. {
  429. return xrep_newbt_free(xnr, true);
  430. }
  431. /*
  432. * Free all the accounting info and all of the disk space we reserved for a new
  433. * btree that we're not going to commit. We want to try to roll things back
  434. * cleanly for things like ENOSPC midway through allocation.
  435. */
  436. void
  437. xrep_newbt_cancel(
  438. struct xrep_newbt *xnr)
  439. {
  440. xrep_newbt_free(xnr, false);
  441. }
/*
 * Feed one of the reserved btree blocks to the bulk loader.
 *
 * On success, @ptr is set to the claimed block's address (long or short
 * form, depending on the btree's pointer length) and the deferred work
 * protecting the reservations is relogged.  Returns -ENOSPC when the
 * reservations are exhausted, or another negative errno.
 */
int
xrep_newbt_claim_block(
	struct xfs_btree_cur	*cur,
	struct xrep_newbt	*xnr,
	union xfs_btree_ptr	*ptr)
{
	struct xrep_newbt_resv	*resv;
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_agblock_t		agbno;

	/*
	 * The first item in the list should always have a free block unless
	 * we're completely out.
	 */
	resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
	if (resv->used == resv->len)
		return -ENOSPC;

	/*
	 * Peel off a block from the start of the reservation.  We allocate
	 * blocks in order to place blocks on disk in increasing record or key
	 * order.  The block reservations tend to end up on the list in
	 * decreasing order, which hopefully results in leaf blocks ending up
	 * together.
	 */
	agbno = resv->agbno + resv->used;
	resv->used++;

	/* If we used all the blocks in this reservation, move it to the end. */
	if (resv->used == resv->len)
		list_move_tail(&resv->list, &xnr->resv_list);

	trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1,
			xnr->oinfo.oi_owner);

	/* Long pointers carry the full fsblock; short ones just the agbno. */
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno,
				agbno));
	else
		ptr->s = cpu_to_be32(agbno);

	/* Relog all the EFIs. */
	return xrep_defer_finish(xnr->sc);
}
  481. /* How many reserved blocks are unused? */
  482. unsigned int
  483. xrep_newbt_unused_blocks(
  484. struct xrep_newbt *xnr)
  485. {
  486. struct xrep_newbt_resv *resv;
  487. unsigned int unused = 0;
  488. list_for_each_entry(resv, &xnr->resv_list, list)
  489. unused += resv->len - resv->used;
  490. return unused;
  491. }