xfs_ag_resv.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
  1. // SPDX-License-Identifier: GPL-2.0+
  2. /*
  3. * Copyright (C) 2016 Oracle. All Rights Reserved.
  4. * Author: Darrick J. Wong <darrick.wong@oracle.com>
  5. */
  6. #include "xfs.h"
  7. #include "xfs_fs.h"
  8. #include "xfs_shared.h"
  9. #include "xfs_format.h"
  10. #include "xfs_log_format.h"
  11. #include "xfs_trans_resv.h"
  12. #include "xfs_mount.h"
  13. #include "xfs_alloc.h"
  14. #include "xfs_errortag.h"
  15. #include "xfs_error.h"
  16. #include "xfs_trace.h"
  17. #include "xfs_trans.h"
  18. #include "xfs_rmap_btree.h"
  19. #include "xfs_btree.h"
  20. #include "xfs_refcount_btree.h"
  21. #include "xfs_ialloc_btree.h"
  22. #include "xfs_ag.h"
  23. #include "xfs_ag_resv.h"
  24. /*
  25. * Per-AG Block Reservations
  26. *
  27. * For some kinds of allocation group metadata structures, it is advantageous
  28. * to reserve a small number of blocks in each AG so that future expansions of
  29. * that data structure do not encounter ENOSPC because errors during a btree
  30. * split cause the filesystem to go offline.
  31. *
  32. * Prior to the introduction of reflink, this wasn't an issue because the free
  33. * space btrees maintain a reserve of space (the AGFL) to handle any expansion
  34. * that may be necessary; and allocations of other metadata (inodes, BMBT,
  35. * dir/attr) aren't restricted to a single AG. However, with reflink it is
  36. * possible to allocate all the space in an AG, have subsequent reflink/CoW
  37. * activity expand the refcount btree, and discover that there's no space left
  38. * to handle that expansion. Since we can calculate the maximum size of the
  39. * refcount btree, we can reserve space for it and avoid ENOSPC.
  40. *
  41. * Handling per-AG reservations consists of four changes to the allocator's
  42. * behavior: First, because these reservations are always needed, we decrease
  43. * the ag_max_usable counter to reflect the size of the AG after the reserved
  44. * blocks are taken. Second, the reservations must be reflected in the
  45. * fdblocks count to maintain proper accounting. Third, each AG must maintain
  46. * its own reserved block counter so that we can calculate the amount of space
  47. * that must remain free to maintain the reservations. Fourth, the "remaining
  48. * reserved blocks" count must be used when calculating the length of the
  49. * longest free extent in an AG and to clamp maxlen in the per-AG allocation
  50. * functions. In other words, we maintain a virtual allocation via in-core
  51. * accounting tricks so that we don't have to clean up after a crash. :)
  52. *
  53. * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type
  54. * values via struct xfs_alloc_arg or directly to the xfs_free_extent
  55. * function. It might seem a little funny to maintain a reservoir of blocks
  56. * to feed another reservoir, but the AGFL only holds enough blocks to get
  57. * through the next transaction. The per-AG reservation is to ensure (we
  58. * hope) that each AG never runs out of blocks. Each data structure wanting
  59. * to use the reservation system should update ask/used in xfs_ag_resv_init.
  60. */
  61. /*
  62. * Are we critically low on blocks? For now we'll define that as the number
  63. * of blocks we can get our hands on being less than 10% of what we reserved
  64. * or less than some arbitrary number (maximum btree height).
  65. */
  66. bool
  67. xfs_ag_resv_critical(
  68. struct xfs_perag *pag,
  69. enum xfs_ag_resv_type type)
  70. {
  71. xfs_extlen_t avail;
  72. xfs_extlen_t orig;
  73. switch (type) {
  74. case XFS_AG_RESV_METADATA:
  75. avail = pag->pagf_freeblks - pag->pag_rmapbt_resv.ar_reserved;
  76. orig = pag->pag_meta_resv.ar_asked;
  77. break;
  78. case XFS_AG_RESV_RMAPBT:
  79. avail = pag->pagf_freeblks + pag->pagf_flcount -
  80. pag->pag_meta_resv.ar_reserved;
  81. orig = pag->pag_rmapbt_resv.ar_asked;
  82. break;
  83. default:
  84. ASSERT(0);
  85. return false;
  86. }
  87. trace_xfs_ag_resv_critical(pag, type, avail);
  88. /* Critically low if less than 10% or max btree height remains. */
  89. return XFS_TEST_ERROR(avail < orig / 10 ||
  90. avail < pag->pag_mount->m_agbtree_maxlevels,
  91. pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL);
  92. }
  93. /*
  94. * How many blocks are reserved but not used, and therefore must not be
  95. * allocated away?
  96. */
  97. xfs_extlen_t
  98. xfs_ag_resv_needed(
  99. struct xfs_perag *pag,
  100. enum xfs_ag_resv_type type)
  101. {
  102. xfs_extlen_t len;
  103. len = pag->pag_meta_resv.ar_reserved + pag->pag_rmapbt_resv.ar_reserved;
  104. switch (type) {
  105. case XFS_AG_RESV_METADATA:
  106. case XFS_AG_RESV_RMAPBT:
  107. len -= xfs_perag_resv(pag, type)->ar_reserved;
  108. break;
  109. case XFS_AG_RESV_NONE:
  110. /* empty */
  111. break;
  112. default:
  113. ASSERT(0);
  114. }
  115. trace_xfs_ag_resv_needed(pag, type, len);
  116. return len;
  117. }
  118. /* Clean out a reservation */
  119. static void
  120. __xfs_ag_resv_free(
  121. struct xfs_perag *pag,
  122. enum xfs_ag_resv_type type)
  123. {
  124. struct xfs_ag_resv *resv;
  125. xfs_extlen_t oldresv;
  126. trace_xfs_ag_resv_free(pag, type, 0);
  127. resv = xfs_perag_resv(pag, type);
  128. if (pag->pag_agno == 0)
  129. pag->pag_mount->m_ag_max_usable += resv->ar_asked;
  130. /*
  131. * RMAPBT blocks come from the AGFL and AGFL blocks are always
  132. * considered "free", so whatever was reserved at mount time must be
  133. * given back at umount.
  134. */
  135. if (type == XFS_AG_RESV_RMAPBT)
  136. oldresv = resv->ar_orig_reserved;
  137. else
  138. oldresv = resv->ar_reserved;
  139. xfs_add_fdblocks(pag->pag_mount, oldresv);
  140. resv->ar_reserved = 0;
  141. resv->ar_asked = 0;
  142. resv->ar_orig_reserved = 0;
  143. }
  144. /* Free a per-AG reservation. */
  145. void
  146. xfs_ag_resv_free(
  147. struct xfs_perag *pag)
  148. {
  149. __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
  150. __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
  151. }
  152. static int
  153. __xfs_ag_resv_init(
  154. struct xfs_perag *pag,
  155. enum xfs_ag_resv_type type,
  156. xfs_extlen_t ask,
  157. xfs_extlen_t used)
  158. {
  159. struct xfs_mount *mp = pag->pag_mount;
  160. struct xfs_ag_resv *resv;
  161. int error;
  162. xfs_extlen_t hidden_space;
  163. if (used > ask)
  164. ask = used;
  165. switch (type) {
  166. case XFS_AG_RESV_RMAPBT:
  167. /*
  168. * Space taken by the rmapbt is not subtracted from fdblocks
  169. * because the rmapbt lives in the free space. Here we must
  170. * subtract the entire reservation from fdblocks so that we
  171. * always have blocks available for rmapbt expansion.
  172. */
  173. hidden_space = ask;
  174. break;
  175. case XFS_AG_RESV_METADATA:
  176. /*
  177. * Space taken by all other metadata btrees are accounted
  178. * on-disk as used space. We therefore only hide the space
  179. * that is reserved but not used by the trees.
  180. */
  181. hidden_space = ask - used;
  182. break;
  183. default:
  184. ASSERT(0);
  185. return -EINVAL;
  186. }
  187. if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL))
  188. error = -ENOSPC;
  189. else
  190. error = xfs_dec_fdblocks(mp, hidden_space, true);
  191. if (error) {
  192. trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
  193. error, _RET_IP_);
  194. xfs_warn(mp,
  195. "Per-AG reservation for AG %u failed. Filesystem may run out of space.",
  196. pag->pag_agno);
  197. return error;
  198. }
  199. /*
  200. * Reduce the maximum per-AG allocation length by however much we're
  201. * trying to reserve for an AG. Since this is a filesystem-wide
  202. * counter, we only make the adjustment for AG 0. This assumes that
  203. * there aren't any AGs hungrier for per-AG reservation than AG 0.
  204. */
  205. if (pag->pag_agno == 0)
  206. mp->m_ag_max_usable -= ask;
  207. resv = xfs_perag_resv(pag, type);
  208. resv->ar_asked = ask;
  209. resv->ar_orig_reserved = hidden_space;
  210. resv->ar_reserved = ask - used;
  211. trace_xfs_ag_resv_init(pag, type, ask);
  212. return 0;
  213. }
  214. /* Create a per-AG block reservation. */
  215. int
  216. xfs_ag_resv_init(
  217. struct xfs_perag *pag,
  218. struct xfs_trans *tp)
  219. {
  220. struct xfs_mount *mp = pag->pag_mount;
  221. xfs_extlen_t ask;
  222. xfs_extlen_t used;
  223. int error = 0, error2;
  224. bool has_resv = false;
  225. /* Create the metadata reservation. */
  226. if (pag->pag_meta_resv.ar_asked == 0) {
  227. ask = used = 0;
  228. error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask, &used);
  229. if (error)
  230. goto out;
  231. error = xfs_finobt_calc_reserves(pag, tp, &ask, &used);
  232. if (error)
  233. goto out;
  234. error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
  235. ask, used);
  236. if (error) {
  237. /*
  238. * Because we didn't have per-AG reservations when the
  239. * finobt feature was added we might not be able to
  240. * reserve all needed blocks. Warn and fall back to the
  241. * old and potentially buggy code in that case, but
  242. * ensure we do have the reservation for the refcountbt.
  243. */
  244. ask = used = 0;
  245. mp->m_finobt_nores = true;
  246. error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask,
  247. &used);
  248. if (error)
  249. goto out;
  250. error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
  251. ask, used);
  252. if (error)
  253. goto out;
  254. }
  255. if (ask)
  256. has_resv = true;
  257. }
  258. /* Create the RMAPBT metadata reservation */
  259. if (pag->pag_rmapbt_resv.ar_asked == 0) {
  260. ask = used = 0;
  261. error = xfs_rmapbt_calc_reserves(mp, tp, pag, &ask, &used);
  262. if (error)
  263. goto out;
  264. error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used);
  265. if (error)
  266. goto out;
  267. if (ask)
  268. has_resv = true;
  269. }
  270. out:
  271. /*
  272. * Initialize the pagf if we have at least one active reservation on the
  273. * AG. This may have occurred already via reservation calculation, but
  274. * fall back to an explicit init to ensure the in-core allocbt usage
  275. * counters are initialized as soon as possible. This is important
  276. * because filesystems with large perag reservations are susceptible to
  277. * free space reservation problems that the allocbt counter is used to
  278. * address.
  279. */
  280. if (has_resv) {
  281. error2 = xfs_alloc_read_agf(pag, tp, 0, NULL);
  282. if (error2)
  283. return error2;
  284. /*
  285. * If there isn't enough space in the AG to satisfy the
  286. * reservation, let the caller know that there wasn't enough
  287. * space. Callers are responsible for deciding what to do
  288. * next, since (in theory) we can stumble along with
  289. * insufficient reservation if data blocks are being freed to
  290. * replenish the AG's free space.
  291. */
  292. if (!error &&
  293. xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
  294. xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved >
  295. pag->pagf_freeblks + pag->pagf_flcount)
  296. error = -ENOSPC;
  297. }
  298. return error;
  299. }
  300. /* Allocate a block from the reservation. */
  301. void
  302. xfs_ag_resv_alloc_extent(
  303. struct xfs_perag *pag,
  304. enum xfs_ag_resv_type type,
  305. struct xfs_alloc_arg *args)
  306. {
  307. struct xfs_ag_resv *resv;
  308. xfs_extlen_t len;
  309. uint field;
  310. trace_xfs_ag_resv_alloc_extent(pag, type, args->len);
  311. switch (type) {
  312. case XFS_AG_RESV_AGFL:
  313. return;
  314. case XFS_AG_RESV_METADATA:
  315. case XFS_AG_RESV_RMAPBT:
  316. resv = xfs_perag_resv(pag, type);
  317. break;
  318. default:
  319. ASSERT(0);
  320. fallthrough;
  321. case XFS_AG_RESV_NONE:
  322. field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
  323. XFS_TRANS_SB_FDBLOCKS;
  324. xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len);
  325. return;
  326. }
  327. len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
  328. resv->ar_reserved -= len;
  329. if (type == XFS_AG_RESV_RMAPBT)
  330. return;
  331. /* Allocations of reserved blocks only need on-disk sb updates... */
  332. xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
  333. /* ...but non-reserved blocks need in-core and on-disk updates. */
  334. if (args->len > len)
  335. xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS,
  336. -((int64_t)args->len - len));
  337. }
  338. /* Free a block to the reservation. */
  339. void
  340. xfs_ag_resv_free_extent(
  341. struct xfs_perag *pag,
  342. enum xfs_ag_resv_type type,
  343. struct xfs_trans *tp,
  344. xfs_extlen_t len)
  345. {
  346. xfs_extlen_t leftover;
  347. struct xfs_ag_resv *resv;
  348. trace_xfs_ag_resv_free_extent(pag, type, len);
  349. switch (type) {
  350. case XFS_AG_RESV_AGFL:
  351. return;
  352. case XFS_AG_RESV_METADATA:
  353. case XFS_AG_RESV_RMAPBT:
  354. resv = xfs_perag_resv(pag, type);
  355. break;
  356. default:
  357. ASSERT(0);
  358. fallthrough;
  359. case XFS_AG_RESV_NONE:
  360. xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
  361. fallthrough;
  362. case XFS_AG_RESV_IGNORE:
  363. return;
  364. }
  365. leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
  366. resv->ar_reserved += leftover;
  367. if (type == XFS_AG_RESV_RMAPBT)
  368. return;
  369. /* Freeing into the reserved pool only requires on-disk update... */
  370. xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len);
  371. /* ...but freeing beyond that requires in-core and on-disk update. */
  372. if (len > leftover)
  373. xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover);
  374. }