xfs_log_cil.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
 */

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_error.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_discard.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_trace.h"

struct workqueue_struct *xfs_discard_wq;

/*
 * Allocate a new ticket. Failing to get a new ticket makes it really hard to
 * recover, so we don't allow failure here. Also, we allocate in a context that
 * we don't want to be issuing transactions from, so we need to tell the
 * allocation code this as well.
 *
 * We don't reserve any space for the ticket - we are going to steal whatever
 * space we require from transactions as they commit. To ensure we reserve all
 * the space required, we need to set the current reservation of the ticket to
 * zero so that we know to steal the initial transaction overhead from the
 * first transaction commit.
 */
static struct xlog_ticket *
xlog_cil_ticket_alloc(
	struct xlog	*log)
{
	struct xlog_ticket *tic;

	tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
				KM_SLEEP|KM_NOFS);

	/*
	 * set the current reservation to zero so we know to steal the basic
	 * transaction overhead reservation from the first transaction commit.
	 */
	tic->t_curr_res = 0;
	return tic;
}
/*
 * After the first stage of log recovery is done, we know where the head and
 * tail of the log are. We need this log initialisation done before we can
 * initialise the first CIL checkpoint context.
 *
 * Here we allocate a log ticket to track space usage during a CIL push. This
 * ticket is passed to xlog_write() directly so that we don't slowly leak log
 * space by failing to account for space used by log headers and additional
 * region headers for split regions.
 */
void
xlog_cil_init_post_recovery(
	struct xlog	*log)
{
	log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
	log->l_cilp->xc_ctx->sequence = 1;
}
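
/*
 * Descriptive note added for clarity: the helper below computes the size of
 * the log vector header region - the struct xfs_log_vec itself plus one
 * struct xfs_log_iovec per vector - rounded up to a multiple of 8 bytes so
 * that the data buffer placed immediately after it starts 64-bit aligned.
 * For a hypothetical niovecs = 2, this is sizeof(struct xfs_log_vec) +
 * 2 * sizeof(struct xfs_log_iovec), rounded up to the next uint64_t boundary.
 */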
static inline int
xlog_cil_iovec_space(
	uint	niovecs)
{
	return round_up((sizeof(struct xfs_log_vec) +
			niovecs * sizeof(struct xfs_log_iovec)),
			sizeof(uint64_t));
}
/*
 * Allocate or pin log vector buffers for CIL insertion.
 *
 * The CIL currently uses disposable buffers for copying a snapshot of the
 * modified items into the log during a push. The biggest problem with this is
 * the requirement to allocate the disposable buffer during the commit if:
 *	a) it does not exist; or
 *	b) it is too small
 *
 * If we do this allocation within xlog_cil_insert_format_items(), it is done
 * under the xc_ctx_lock, which means that a CIL push cannot occur during
 * the memory allocation. This means that we have a potential deadlock situation
 * under low memory conditions when we have lots of dirty metadata pinned in
 * the CIL and we need a CIL commit to occur to free memory.
 *
 * To avoid this, we need to move the memory allocation outside the
 * xc_ctx_lock, but because the log vector buffers are disposable, that opens
 * up a TOCTOU race condition w.r.t. the CIL committing and removing the log
 * vector buffers between the check and the formatting of the item into the
 * log vector buffer within the xc_ctx_lock.
 *
 * Because the log vector buffer needs to be unchanged during the CIL push
 * process, we cannot share the buffer between the transaction commit (which
 * modifies the buffer) and the CIL push context that is writing the changes
 * into the log. This means skipping preallocation of buffer space is
 * unreliable, but we most definitely do not want to be allocating and freeing
 * buffers unnecessarily during commits when overwrites can be done safely.
 *
 * The simplest solution to this problem is to allocate a shadow buffer when a
 * log item is committed for the second time, and then to only use this buffer
 * if necessary. The buffer can remain attached to the log item until such time
 * as it is needed, and this is the buffer that is reallocated to match the
 * size of the incoming modification. Then during the formatting of the item we
 * can swap the active buffer with the new one if we can't reuse the existing
 * buffer. We don't free the old buffer as it may be reused on the next
 * modification if its size is right, otherwise we'll free and reallocate it at
 * that point.
 *
 * This function builds a vector for the changes in each log item in the
 * transaction. It then works out the length of the buffer needed for each log
 * item, allocates them and attaches the vector to the log item in preparation
 * for the formatting step which occurs under the xc_ctx_lock.
 *
 * While this means the memory footprint goes up, it avoids the repeated
 * alloc/free pattern that repeated modifications of an item would otherwise
 * cause, and hence minimises the CPU overhead of such behaviour.
 */
static void
xlog_cil_alloc_shadow_bufs(
	struct xlog		*log,
	struct xfs_trans	*tp)
{
	struct xfs_log_item	*lip;

	list_for_each_entry(lip, &tp->t_items, li_trans) {
		struct xfs_log_vec *lv;
		int	niovecs = 0;
		int	nbytes = 0;
		int	buf_size;
		bool	ordered = false;

		/* Skip items which aren't dirty in this transaction. */
		if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
			continue;

		/* get number of vecs and size of data to be stored */
		lip->li_ops->iop_size(lip, &niovecs, &nbytes);

		/*
		 * Ordered items need to be tracked but we do not wish to write
		 * them. We need a logvec to track the object, but we do not
		 * need an iovec or buffer to be allocated for copying data.
		 */
		if (niovecs == XFS_LOG_VEC_ORDERED) {
			ordered = true;
			niovecs = 0;
			nbytes = 0;
		}

		/*
		 * We 64-bit align the length of each iovec so that the start
		 * of the next one is naturally aligned. We'll need to
		 * account for that slack space here. Then round nbytes up
		 * to 64-bit alignment so that the initial buffer alignment is
		 * easy to calculate and verify.
		 */
		nbytes += niovecs * sizeof(uint64_t);
		nbytes = round_up(nbytes, sizeof(uint64_t));

		/*
		 * The data buffer needs to start 64-bit aligned, so round up
		 * that space to ensure we can align it appropriately and not
		 * overrun the buffer.
		 */
		buf_size = nbytes + xlog_cil_iovec_space(niovecs);

		/*
		 * if we have no shadow buffer, or it is too small, we need to
		 * reallocate it.
		 */
		if (!lip->li_lv_shadow ||
		    buf_size > lip->li_lv_shadow->lv_size) {
			/*
			 * We free and allocate here as a realloc would copy
			 * unnecessary data. We don't use kmem_zalloc() for the
			 * same reason - we don't need to zero the data area in
			 * the buffer, only the log vector header and the iovec
			 * storage.
			 */
			kmem_free(lip->li_lv_shadow);

			lv = kmem_alloc_large(buf_size, KM_SLEEP | KM_NOFS);
			memset(lv, 0, xlog_cil_iovec_space(niovecs));

			lv->lv_item = lip;
			lv->lv_size = buf_size;
			if (ordered)
				lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
			else
				lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
			lip->li_lv_shadow = lv;
		} else {
			/* same or smaller, optimise common overwrite case */
			lv = lip->li_lv_shadow;
			if (ordered)
				lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
			else
				lv->lv_buf_len = 0;
			lv->lv_bytes = 0;
			lv->lv_next = NULL;
		}

		/* Ensure the lv is set up according to ->iop_size */
		lv->lv_niovecs = niovecs;

		/* The allocated data region lies beyond the iovec region */
		lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs);
	}
}
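
/*
 * For reference, the shadow buffer laid out by the function above looks like
 * this in memory (sketch added for clarity, not part of the original source):
 *
 *	+--------------------+---------------------+----------------------+
 *	| struct xfs_log_vec | niovecs iovec slots | 64-bit aligned data  |
 *	+--------------------+---------------------+----------------------+
 *	^lv                  ^lv_iovecp            ^lv_buf
 *
 * lv_buf always sits xlog_cil_iovec_space(niovecs) bytes past the start of
 * the allocation, which is why that helper is used both for the memset()
 * length and for the data pointer offset.
 */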
/*
 * Prepare the log item for insertion into the CIL. Calculate the difference in
 * log space and vectors it will consume, and if it is a new item pin it as
 * well.
 */
STATIC void
xfs_cil_prepare_item(
	struct xlog		*log,
	struct xfs_log_vec	*lv,
	struct xfs_log_vec	*old_lv,
	int			*diff_len,
	int			*diff_iovecs)
{
	/* Account for the new LV being passed in */
	if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
		*diff_len += lv->lv_bytes;
		*diff_iovecs += lv->lv_niovecs;
	}

	/*
	 * If there is no old LV, this is the first time we've seen the item in
	 * this CIL context and so we need to pin it. If we are replacing the
	 * old_lv, then remove the space it accounts for and make it the shadow
	 * buffer for later freeing. In both cases we are now switching to the
	 * shadow buffer, so update the pointer to it appropriately.
	 */
	if (!old_lv) {
		lv->lv_item->li_ops->iop_pin(lv->lv_item);
		lv->lv_item->li_lv_shadow = NULL;
	} else if (old_lv != lv) {
		ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);

		*diff_len -= old_lv->lv_bytes;
		*diff_iovecs -= old_lv->lv_niovecs;
		lv->lv_item->li_lv_shadow = old_lv;
	}

	/* attach new log vector to log item */
	lv->lv_item->li_lv = lv;

	/*
	 * If this is the first time the item is being committed to the
	 * CIL, store the sequence number on the log item so we can
	 * tell in future commits whether this is the first checkpoint
	 * the item is being committed into.
	 */
	if (!lv->lv_item->li_seq)
		lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
}
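
/*
 * Summary of the three cases handled above (note added for clarity): an item
 * new to this CIL context is pinned and has its shadow pointer cleared if the
 * shadow is the lv being inserted; a relog that reuses the existing li_lv
 * leaves li_lv_shadow in place for next time; and a relog that switches to
 * the shadow buffer parks the old li_lv in li_lv_shadow so it can be reused
 * or freed lazily on a later modification.
 */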
/*
 * Format log item into a flat buffer
 *
 * For delayed logging, we need to hold a formatted buffer containing all the
 * changes on the log item. This enables us to relog the item in memory and
 * write it out asynchronously without needing to relock the object that was
 * modified at the time it gets written into the iclog.
 *
 * This function takes the prepared log vectors attached to each log item, and
 * formats the changes into the log vector buffer. The buffer it uses is
 * dependent on the current state of the vector in the CIL - the shadow lv is
 * guaranteed to be large enough for the current modification, but we will only
 * use that if we can't reuse the existing lv. If we can't reuse the existing
 * lv, then simply swap it out for the shadow lv. We don't free it - that is
 * done lazily either by the next modification or the freeing of the log item.
 *
 * We don't set up region headers during this process; we simply copy the
 * regions into the flat buffer. We can do this because we still have to do a
 * formatting step to write the regions into the iclog buffer. Writing the
 * ophdrs during the iclog write means that we can support splitting large
 * regions across iclog boundaries without needing a change in the format of
 * the item/region encapsulation.
 *
 * Hence what we need to do now is rewrite the vector array to point to the
 * copied region inside the buffer we just allocated. This allows us to format
 * the regions into the iclog as though they are being formatted directly out
 * of the objects themselves.
 */
static void
xlog_cil_insert_format_items(
	struct xlog		*log,
	struct xfs_trans	*tp,
	int			*diff_len,
	int			*diff_iovecs)
{
	struct xfs_log_item	*lip;

	/* Bail out if we didn't find a log item. */
	if (list_empty(&tp->t_items)) {
		ASSERT(0);
		return;
	}

	list_for_each_entry(lip, &tp->t_items, li_trans) {
		struct xfs_log_vec *lv;
		struct xfs_log_vec *old_lv = NULL;
		struct xfs_log_vec *shadow;
		bool	ordered = false;

		/* Skip items which aren't dirty in this transaction. */
		if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
			continue;

		/*
		 * The formatting size information is already attached to
		 * the shadow lv on the log item.
		 */
		shadow = lip->li_lv_shadow;
		if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED)
			ordered = true;

		/* Skip items that do not have any vectors for writing */
		if (!shadow->lv_niovecs && !ordered)
			continue;

		/* compare to existing item size */
		old_lv = lip->li_lv;
		if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
			/* same or smaller, optimise common overwrite case */
			lv = lip->li_lv;
			lv->lv_next = NULL;

			if (ordered)
				goto insert;

			/*
			 * set the item up as though it is a new insertion so
			 * that the space reservation accounting is correct.
			 */
			*diff_iovecs -= lv->lv_niovecs;
			*diff_len -= lv->lv_bytes;

			/* Ensure the lv is set up according to ->iop_size */
			lv->lv_niovecs = shadow->lv_niovecs;

			/* reset the lv buffer information for new formatting */
			lv->lv_buf_len = 0;
			lv->lv_bytes = 0;
			lv->lv_buf = (char *)lv +
					xlog_cil_iovec_space(lv->lv_niovecs);
		} else {
			/* switch to shadow buffer! */
			lv = shadow;
			lv->lv_item = lip;
			if (ordered) {
				/* track as an ordered logvec */
				ASSERT(lip->li_lv == NULL);
				goto insert;
			}
		}

		ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
		lip->li_ops->iop_format(lip, lv);
insert:
		xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
	}
}
/*
 * Insert the log items into the CIL and calculate the difference in space
 * consumed by the item. Add the space to the checkpoint ticket and calculate
 * if the change requires additional log metadata. If it does, take that space
 * as well. Remove the amount of space we added to the checkpoint ticket from
 * the current transaction ticket so that the accounting works out correctly.
 */
static void
xlog_cil_insert_items(
	struct xlog		*log,
	struct xfs_trans	*tp)
{
	struct xfs_cil		*cil = log->l_cilp;
	struct xfs_cil_ctx	*ctx = cil->xc_ctx;
	struct xfs_log_item	*lip;
	int			len = 0;
	int			diff_iovecs = 0;
	int			iclog_space;
	int			iovhdr_res = 0, split_res = 0, ctx_res = 0;

	ASSERT(tp);

	/*
	 * We can do this safely because the context can't checkpoint until we
	 * are done so it doesn't matter exactly how we update the CIL.
	 */
	xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs);

	spin_lock(&cil->xc_cil_lock);

	/* account for space used by new iovec headers */
	iovhdr_res = diff_iovecs * sizeof(xlog_op_header_t);
	len += iovhdr_res;
	ctx->nvecs += diff_iovecs;

	/* attach the transaction to the CIL if it has any busy extents */
	if (!list_empty(&tp->t_busy))
		list_splice_init(&tp->t_busy, &ctx->busy_extents);

	/*
	 * Now transfer enough transaction reservation to the context ticket
	 * for the checkpoint. The context ticket is special - the unit
	 * reservation has to grow as well as the current reservation as we
	 * steal from tickets so we can correctly determine the space used
	 * during the transaction commit.
	 */
	if (ctx->ticket->t_curr_res == 0) {
		ctx_res = ctx->ticket->t_unit_res;
		ctx->ticket->t_curr_res = ctx_res;
		tp->t_ticket->t_curr_res -= ctx_res;
	}

	/* do we need space for more log record headers? */
	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
	if (len > 0 && (ctx->space_used / iclog_space !=
				(ctx->space_used + len) / iclog_space)) {
		split_res = (len + iclog_space - 1) / iclog_space;
		/* need to take into account split region headers, too */
		split_res *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
		ctx->ticket->t_unit_res += split_res;
		ctx->ticket->t_curr_res += split_res;
		tp->t_ticket->t_curr_res -= split_res;
		ASSERT(tp->t_ticket->t_curr_res >= len);
	}
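
	/*
	 * Worked example for the split accounting above (illustrative numbers
	 * only, not taken from the original source): with a 32768 byte iclog
	 * and a 512 byte header, iclog_space is 32256 bytes. If this insertion
	 * adds len = 70000 bytes and crosses an iclog boundary, then
	 * split_res = (70000 + 32255) / 32256 = 3, so we steal reservation for
	 * three extra record headers plus their op headers from the
	 * transaction ticket.
	 */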
	tp->t_ticket->t_curr_res -= len;
	ctx->space_used += len;

	/*
	 * If we've overrun the reservation, dump the tx details before we move
	 * the log items. Shutdown is imminent...
	 */
	if (WARN_ON(tp->t_ticket->t_curr_res < 0)) {
		xfs_warn(log->l_mp, "Transaction log reservation overrun:");
		xfs_warn(log->l_mp,
			 "  log items: %d bytes (iov hdrs: %d bytes)",
			 len, iovhdr_res);
		xfs_warn(log->l_mp, "  split region headers: %d bytes",
			 split_res);
		xfs_warn(log->l_mp, "  ctx ticket: %d bytes", ctx_res);
		xlog_print_trans(tp);
	}

	/*
	 * Now (re-)position everything modified at the tail of the CIL.
	 * We do this here so we only need to take the CIL lock once during
	 * the transaction commit.
	 */
	list_for_each_entry(lip, &tp->t_items, li_trans) {

		/* Skip items which aren't dirty in this transaction. */
		if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
			continue;

		/*
		 * Only move the item if it isn't already at the tail. This is
		 * to prevent a transient list_empty() state when reinserting
		 * an item that is already the only item in the CIL.
		 */
		if (!list_is_last(&lip->li_cil, &cil->xc_cil))
			list_move_tail(&lip->li_cil, &cil->xc_cil);
	}

	spin_unlock(&cil->xc_cil_lock);

	if (tp->t_ticket->t_curr_res < 0)
		xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
}
static void
xlog_cil_free_logvec(
	struct xfs_log_vec	*log_vector)
{
	struct xfs_log_vec	*lv;

	for (lv = log_vector; lv; ) {
		struct xfs_log_vec *next = lv->lv_next;
		kmem_free(lv);
		lv = next;
	}
}
static void
xlog_discard_endio_work(
	struct work_struct *work)
{
	struct xfs_cil_ctx *ctx =
		container_of(work, struct xfs_cil_ctx, discard_endio_work);
	struct xfs_mount *mp = ctx->cil->xc_log->l_mp;

	xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
	kmem_free(ctx);
}

/*
 * Queue up the actual completion to a thread to avoid IRQ-safe locking for
 * pagb_lock. Note that we need an unbounded workqueue, otherwise we might
 * get the execution delayed up to 30 seconds for weird reasons.
 */
static void
xlog_discard_endio(
	struct bio *bio)
{
	struct xfs_cil_ctx *ctx = bio->bi_private;

	INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work);
	queue_work(xfs_discard_wq, &ctx->discard_endio_work);
	bio_put(bio);
}
static void
xlog_discard_busy_extents(
	struct xfs_mount	*mp,
	struct xfs_cil_ctx	*ctx)
{
	struct list_head	*list = &ctx->busy_extents;
	struct xfs_extent_busy	*busyp;
	struct bio		*bio = NULL;
	struct blk_plug		plug;
	int			error = 0;

	ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);

	blk_start_plug(&plug);
	list_for_each_entry(busyp, list, list) {
		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
					 busyp->length);

		error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
				XFS_FSB_TO_BB(mp, busyp->length),
				GFP_NOFS, 0, &bio);
		if (error && error != -EOPNOTSUPP) {
			xfs_info(mp,
				 "discard failed for extent [0x%llx,%u], error %d",
				 (unsigned long long)busyp->bno,
				 busyp->length,
				 error);
			break;
		}
	}

	if (bio) {
		bio->bi_private = ctx;
		bio->bi_end_io = xlog_discard_endio;
		submit_bio(bio);
	} else {
		xlog_discard_endio_work(&ctx->discard_endio_work);
	}
	blk_finish_plug(&plug);
}
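
/*
 * Note on the completion handling above (added for clarity): successive calls
 * to __blkdev_issue_discard() chain the discard bios together through the
 * @bio argument, so only the final bio in the chain needs the endio handler
 * attached - it completes once all the chained discards have finished. If no
 * bio was ever issued (e.g. an empty busy list), the completion work is run
 * synchronously instead.
 */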
/*
 * Mark all items committed and clear busy extents. We free the log vector
 * chains in a separate pass so that we unpin the log items as quickly as
 * possible.
 */
static void
xlog_cil_committed(
	void	*args,
	int	abort)
{
	struct xfs_cil_ctx	*ctx = args;
	struct xfs_mount	*mp = ctx->cil->xc_log->l_mp;

	xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
					ctx->start_lsn, abort);

	xfs_extent_busy_sort(&ctx->busy_extents);
	xfs_extent_busy_clear(mp, &ctx->busy_extents,
			     (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);

	/*
	 * If we are aborting the commit, wake up anyone waiting on the
	 * committing list. If we don't, then during a shutdown we can leave
	 * processes waiting in xlog_cil_force_lsn() on a sequence commit that
	 * will never happen because we aborted it.
	 */
	spin_lock(&ctx->cil->xc_push_lock);
	if (abort)
		wake_up_all(&ctx->cil->xc_commit_wait);
	list_del(&ctx->committing);
	spin_unlock(&ctx->cil->xc_push_lock);

	xlog_cil_free_logvec(ctx->lv_chain);

	if (!list_empty(&ctx->busy_extents))
		xlog_discard_busy_extents(mp, ctx);
	else
		kmem_free(ctx);
}
/*
 * Push the Committed Item List to the log. If @push_seq is zero, then it is
 * a background flush and so we can choose to ignore it. Otherwise, if the
 * current sequence is the same as @push_seq we need to do a flush. If
 * @push_seq is less than the current sequence, then it has already been
 * flushed and we don't need to do anything - the caller will wait for it to
 * complete if necessary.
 *
 * @push_seq is a value rather than a flag because that allows us to do an
 * unlocked check of the sequence number for a match. Hence we can allow log
 * forces to run racily and not issue pushes for the same sequence twice. If we
 * get a race between multiple pushes for the same sequence they will block on
 * the first one and then abort, hence avoiding needless pushes.
 */
STATIC int
xlog_cil_push(
	struct xlog		*log)
{
	struct xfs_cil		*cil = log->l_cilp;
	struct xfs_log_vec	*lv;
	struct xfs_cil_ctx	*ctx;
	struct xfs_cil_ctx	*new_ctx;
	struct xlog_in_core	*commit_iclog;
	struct xlog_ticket	*tic;
	int			num_iovecs;
	int			error = 0;
	struct xfs_trans_header thdr;
	struct xfs_log_iovec	lhdr;
	struct xfs_log_vec	lvhdr = { NULL };
	xfs_lsn_t		commit_lsn;
	xfs_lsn_t		push_seq;

	if (!cil)
		return 0;

	new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
	new_ctx->ticket = xlog_cil_ticket_alloc(log);

	down_write(&cil->xc_ctx_lock);
	ctx = cil->xc_ctx;

	spin_lock(&cil->xc_push_lock);
	push_seq = cil->xc_push_seq;
	ASSERT(push_seq <= ctx->sequence);

	/*
	 * Check if we've anything to push. If there is nothing, then we don't
	 * move on to a new sequence number and so we have to be able to push
	 * this sequence again later.
	 */
	if (list_empty(&cil->xc_cil)) {
		cil->xc_push_seq = 0;
		spin_unlock(&cil->xc_push_lock);
		goto out_skip;
	}

	/* check for a previously pushed sequence */
	if (push_seq < cil->xc_ctx->sequence) {
		spin_unlock(&cil->xc_push_lock);
		goto out_skip;
	}

	/*
	 * We are now going to push this context, so add it to the committing
	 * list before we do anything else. This ensures that anyone waiting on
	 * this push can easily detect the difference between a "push in
	 * progress" and "CIL is empty, nothing to do".
	 *
	 * IOWs, a wait loop can now check for:
	 *	the current sequence not being found on the committing list;
	 *	an empty CIL; and
	 *	an unchanged sequence number
	 * to detect a push that had nothing to do and therefore does not need
	 * waiting on. If the CIL is not empty, we get put on the committing
	 * list before emptying the CIL and bumping the sequence number. Hence
	 * an empty CIL and an unchanged sequence number means we jumped out
	 * above after doing nothing.
	 *
	 * Hence the waiter will either find the commit sequence on the
	 * committing list or the sequence number will be unchanged and the CIL
	 * still dirty. In that latter case, the push has not yet started, and
	 * so the waiter will have to continue trying to check the CIL
	 * committing list until it is found. In extreme cases of delay, the
	 * sequence may fully commit between the waiter's attempts to find it
	 * on the committing list.
	 */
	list_add(&ctx->committing, &cil->xc_committing);
	spin_unlock(&cil->xc_push_lock);
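
	/*
	 * Sketch of the waiter's decision logic described above (added for
	 * clarity; the real implementation is the loop in
	 * xlog_cil_force_lsn() below):
	 *
	 *	if sequence is on cil->xc_committing:
	 *		wait for its commit_lsn to be set
	 *	else if sequence == xc_current_sequence and the CIL is dirty:
	 *		push has not started yet - retry
	 *	else:
	 *		nothing to wait for
	 */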
	/*
	 * pull all the log vectors off the items in the CIL, and
	 * remove the items from the CIL. We don't need the CIL lock
	 * here because it's only needed on the transaction commit
	 * side which is currently locked out by the flush lock.
	 */
	lv = NULL;
	num_iovecs = 0;
	while (!list_empty(&cil->xc_cil)) {
		struct xfs_log_item	*item;

		item = list_first_entry(&cil->xc_cil,
					struct xfs_log_item, li_cil);
		list_del_init(&item->li_cil);
		if (!ctx->lv_chain)
			ctx->lv_chain = item->li_lv;
		else
			lv->lv_next = item->li_lv;
		lv = item->li_lv;
		item->li_lv = NULL;
		num_iovecs += lv->lv_niovecs;
	}

	/*
	 * initialise the new context and attach it to the CIL. Then attach
	 * the current context to the CIL committing list so it can be found
	 * during log forces to extract the commit lsn of the sequence that
	 * needs to be forced.
	 */
	INIT_LIST_HEAD(&new_ctx->committing);
	INIT_LIST_HEAD(&new_ctx->busy_extents);
	new_ctx->sequence = ctx->sequence + 1;
	new_ctx->cil = cil;
	cil->xc_ctx = new_ctx;

	/*
	 * The switch is now done, so we can drop the context lock and move out
	 * of a shared context. We can't just go straight to the commit record,
	 * though - we need to synchronise with previous and future commits so
	 * that the commit records are correctly ordered in the log to ensure
	 * that we process items during log IO completion in the correct order.
	 *
	 * For example, if we get an EFI in one checkpoint and the EFD in the
	 * next (e.g. due to log forces), we do not want the checkpoint with
	 * the EFD to be committed before the checkpoint with the EFI. Hence
	 * we must strictly order the commit records of the checkpoints so
	 * that: a) the checkpoint callbacks are attached to the iclogs in the
	 * correct order; and b) the checkpoints are replayed in correct order
	 * in log recovery.
	 *
	 * Hence we need to add this context to the committing context list so
	 * that higher sequences will wait for us to write out a commit record
	 * before they do.
	 *
	 * xfs_log_force_lsn requires us to mirror the new sequence into the
	 * cil structure atomically with the addition of this sequence to the
	 * committing list. This also ensures that we can do unlocked checks
	 * against the current sequence in log forces without risking
	 * dereferencing a freed context pointer.
	 */
	spin_lock(&cil->xc_push_lock);
	cil->xc_current_sequence = new_ctx->sequence;
	spin_unlock(&cil->xc_push_lock);
	up_write(&cil->xc_ctx_lock);
	/*
	 * Build a checkpoint transaction header and write it to the log to
	 * begin the transaction. We need to account for the space used by the
	 * transaction header here as it is not accounted for in xlog_write().
	 *
	 * The LSN we need to pass to the log items on transaction commit is
	 * the LSN reported by the first log vector write. If we use the commit
	 * record lsn then we can move the tail beyond the grant write head.
	 */
	tic = ctx->ticket;
	thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
	thdr.th_type = XFS_TRANS_CHECKPOINT;
	thdr.th_tid = tic->t_tid;
	thdr.th_num_items = num_iovecs;
	lhdr.i_addr = &thdr;
	lhdr.i_len = sizeof(xfs_trans_header_t);
	lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
	tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);

	lvhdr.lv_niovecs = 1;
	lvhdr.lv_iovecp = &lhdr;
	lvhdr.lv_next = ctx->lv_chain;

	error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
	if (error)
		goto out_abort_free_ticket;

	/*
	 * now that we've written the checkpoint into the log, strictly
	 * order the commit records so replay will get them in the right order.
	 */
restart:
	spin_lock(&cil->xc_push_lock);
	list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
		/*
		 * Avoid getting stuck in this loop because we were woken by the
		 * shutdown, but then went back to sleep once already in the
		 * shutdown state.
		 */
		if (XLOG_FORCED_SHUTDOWN(log)) {
			spin_unlock(&cil->xc_push_lock);
			goto out_abort_free_ticket;
		}

		/*
		 * Higher sequences will wait for this one so skip them.
		 * Don't wait for our own sequence, either.
		 */
		if (new_ctx->sequence >= ctx->sequence)
			continue;
		if (!new_ctx->commit_lsn) {
			/*
			 * It is still being pushed! Wait for the push to
			 * complete, then start again from the beginning.
			 */
			xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
			goto restart;
		}
	}
	spin_unlock(&cil->xc_push_lock);

	/* xfs_log_done always frees the ticket on error. */
	commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false);
	if (commit_lsn == -1)
		goto out_abort;

	/* attach all the transactions w/ busy extents to iclog */
	ctx->log_cb.cb_func = xlog_cil_committed;
	ctx->log_cb.cb_arg = ctx;
	error = xfs_log_notify(commit_iclog, &ctx->log_cb);
	if (error)
		goto out_abort;

	/*
	 * now the checkpoint commit is complete and we've attached the
	 * callbacks to the iclog we can assign the commit LSN to the context
	 * and wake up anyone who is waiting for the commit to complete.
	 */
	spin_lock(&cil->xc_push_lock);
	ctx->commit_lsn = commit_lsn;
	wake_up_all(&cil->xc_commit_wait);
	spin_unlock(&cil->xc_push_lock);

	/* release the hounds! */
	return xfs_log_release_iclog(log->l_mp, commit_iclog);

out_skip:
	up_write(&cil->xc_ctx_lock);
	xfs_log_ticket_put(new_ctx->ticket);
	kmem_free(new_ctx);
	return 0;

out_abort_free_ticket:
	xfs_log_ticket_put(tic);
out_abort:
	xlog_cil_committed(ctx, XFS_LI_ABORTED);
	return -EIO;
}
static void
xlog_cil_push_work(
	struct work_struct	*work)
{
	struct xfs_cil *cil = container_of(work, struct xfs_cil,
							xc_push_work);
	xlog_cil_push(cil->xc_log);
}

/*
 * We need to push CIL every so often so we don't cache more than we can fit in
 * the log. The limit really is that a checkpoint can't be more than half the
 * log (the current checkpoint is not allowed to overwrite the previous
 * checkpoint), but commit latency and memory usage limit this to a smaller
 * size.
 */
static void
xlog_cil_push_background(
	struct xlog	*log)
{
	struct xfs_cil	*cil = log->l_cilp;

	/*
	 * The cil won't be empty because we are called while holding the
	 * context lock so whatever we added to the CIL will still be there.
	 */
	ASSERT(!list_empty(&cil->xc_cil));

	/*
	 * don't do a background push if we haven't used up all the
	 * space available yet.
	 */
	if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
		return;

	spin_lock(&cil->xc_push_lock);
	if (cil->xc_push_seq < cil->xc_current_sequence) {
		cil->xc_push_seq = cil->xc_current_sequence;
		queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
	}
	spin_unlock(&cil->xc_push_lock);
}
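
/*
 * Background note (added, not from the original source): XLOG_CIL_SPACE_LIMIT()
 * is defined in xfs_log_priv.h; in kernels of this vintage it evaluates to a
 * fixed fraction of the log size (an eighth, i.e. log->l_logsize >> 3), which
 * keeps background pushes well below the half-the-log hard limit described
 * above. See the header for the authoritative definition.
 */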
/*
 * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence
 * number that is passed. When it returns, the work will be queued for
 * @push_seq, but it won't be completed. The caller is expected to do any
 * waiting for push_seq to complete if it is required.
 */
static void
xlog_cil_push_now(
	struct xlog	*log,
	xfs_lsn_t	push_seq)
{
	struct xfs_cil	*cil = log->l_cilp;

	if (!cil)
		return;

	ASSERT(push_seq && push_seq <= cil->xc_current_sequence);

	/* start on any pending background push to minimise wait time on it */
	flush_work(&cil->xc_push_work);

	/*
	 * If the CIL is empty or we've already pushed the sequence then
	 * there's no work we need to do.
	 */
	spin_lock(&cil->xc_push_lock);
	if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
		spin_unlock(&cil->xc_push_lock);
		return;
	}

	cil->xc_push_seq = push_seq;
	queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
	spin_unlock(&cil->xc_push_lock);
}
bool
xlog_cil_empty(
	struct xlog	*log)
{
	struct xfs_cil	*cil = log->l_cilp;
	bool		empty = false;

	spin_lock(&cil->xc_push_lock);
	if (list_empty(&cil->xc_cil))
		empty = true;
	spin_unlock(&cil->xc_push_lock);
	return empty;
}
/*
 * Commit a transaction with the given vector to the Committed Item List.
 *
 * To do this, we need to format the item, pin it in memory if required and
 * account for the space used by the transaction. Once we have done that we
 * need to release the unused reservation for the transaction, attach the
 * transaction to the checkpoint context so we carry the busy extents through
 * to checkpoint completion, and then unlock all the items in the transaction.
 *
 * Called with the context lock already held in read mode to lock out
 * background commit, returns without it held once background commits are
 * allowed again.
 */
void
xfs_log_commit_cil(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_lsn_t		*commit_lsn,
	bool			regrant)
{
	struct xlog		*log = mp->m_log;
	struct xfs_cil		*cil = log->l_cilp;
	xfs_lsn_t		xc_commit_lsn;

	/*
	 * Do all necessary memory allocation before we lock the CIL.
	 * This ensures the allocation does not deadlock with a CIL
	 * push in memory reclaim (e.g. from kswapd).
	 */
	xlog_cil_alloc_shadow_bufs(log, tp);

	/* lock out background commit */
	down_read(&cil->xc_ctx_lock);

	xlog_cil_insert_items(log, tp);

	xc_commit_lsn = cil->xc_ctx->sequence;
	if (commit_lsn)
		*commit_lsn = xc_commit_lsn;

	xfs_log_done(mp, tp->t_ticket, NULL, regrant);
	tp->t_ticket = NULL;
	xfs_trans_unreserve_and_mod_sb(tp);

	/*
	 * Once all the items of the transaction have been copied to the CIL,
	 * the items can be unlocked and freed.
	 *
	 * This needs to be done before we drop the CIL context lock because we
	 * have to update state in the log items and unlock them before they go
	 * to disk. If we don't, then the CIL checkpoint can race with us and
	 * we can run checkpoint completion before we've updated and unlocked
	 * the log items. This affects (at least) processing of stale buffers,
	 * inodes and EFIs.
	 */
	xfs_trans_free_items(tp, xc_commit_lsn, false);

	xlog_cil_push_background(log);

	up_read(&cil->xc_ctx_lock);
}
/*
 * Conditionally push the CIL based on the sequence passed in.
 *
 * We only need to push if we haven't already pushed the sequence
 * number given. Hence the only time we will trigger a push here is
 * if the push sequence is the same as the current context.
 *
 * We return the current commit lsn to allow the callers to determine if an
 * iclog flush is necessary following this call.
 */
xfs_lsn_t
xlog_cil_force_lsn(
	struct xlog	*log,
	xfs_lsn_t	sequence)
{
	struct xfs_cil		*cil = log->l_cilp;
	struct xfs_cil_ctx	*ctx;
	xfs_lsn_t		commit_lsn = NULLCOMMITLSN;

	ASSERT(sequence <= cil->xc_current_sequence);

	/*
	 * check to see if we need to force out the current context.
	 * xlog_cil_push() handles racing pushes for the same sequence,
	 * so no need to deal with it here.
	 */
restart:
	xlog_cil_push_now(log, sequence);

	/*
	 * See if we can find a previous sequence still committing.
	 * We need to wait for all previous sequence commits to complete
	 * before allowing the force of push_seq to go ahead. Hence block
	 * on commits for those as well.
	 */
	spin_lock(&cil->xc_push_lock);
	list_for_each_entry(ctx, &cil->xc_committing, committing) {
		/*
		 * Avoid getting stuck in this loop because we were woken by the
		 * shutdown, but then went back to sleep once already in the
		 * shutdown state.
		 */
		if (XLOG_FORCED_SHUTDOWN(log))
			goto out_shutdown;
		if (ctx->sequence > sequence)
			continue;
		if (!ctx->commit_lsn) {
			/*
			 * It is still being pushed! Wait for the push to
			 * complete, then start again from the beginning.
			 */
			xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
			goto restart;
		}
		if (ctx->sequence != sequence)
			continue;
		/* found it! */
		commit_lsn = ctx->commit_lsn;
	}

	/*
	 * The call to xlog_cil_push_now() executes the push in the background.
	 * Hence by the time we have got here our sequence may not have been
	 * pushed yet. This is true if the current sequence still matches the
	 * push sequence after the above wait loop and the CIL still contains
	 * dirty objects. This is guaranteed by the push code first adding the
	 * context to the committing list before emptying the CIL.
	 *
	 * Hence if we don't find the context in the committing list and the
	 * current sequence number is unchanged then the CIL contents are
	 * significant. If the CIL is empty, it means there was nothing to push
	 * and that means there is nothing to wait for. If the CIL is not empty,
	 * it means we haven't yet started the push, because if it had started
	 * we would have found the context on the committing list.
	 */
	if (sequence == cil->xc_current_sequence &&
	    !list_empty(&cil->xc_cil)) {
		spin_unlock(&cil->xc_push_lock);
		goto restart;
	}

	spin_unlock(&cil->xc_push_lock);
	return commit_lsn;

	/*
	 * We detected a shutdown in progress. We need to trigger the log force
	 * to pass through its iclog state machine error handling, even though
	 * we are already in a shutdown state. Hence we can't return
	 * NULLCOMMITLSN here as that has special meaning to log forces (i.e.
	 * LSN is already stable), so we return a zero LSN instead.
	 */
out_shutdown:
	spin_unlock(&cil->xc_push_lock);
	return 0;
}
/*
 * Check if the current log item was first committed in this sequence.
 * We can't rely on just the log item being in the CIL, we have to check
 * the recorded commit sequence number.
 *
 * Note: for this to be used in a non-racy manner, it has to be called with
 * CIL flushing locked out. As a result, it should only be used during the
 * transaction commit process when deciding what to format into the item.
 */
bool
xfs_log_item_in_current_chkpt(
	struct xfs_log_item *lip)
{
	struct xfs_cil_ctx *ctx;

	if (list_empty(&lip->li_cil))
		return false;

	ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;

	/*
	 * li_seq is written on the first commit of a log item to record the
	 * first checkpoint it is written to. Hence if it is different to the
	 * current sequence, we're in a new checkpoint.
	 */
	if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
		return false;
	return true;
}
/*
 * Perform initial CIL structure initialisation.
 */
int
xlog_cil_init(
	struct xlog	*log)
{
	struct xfs_cil	*cil;
	struct xfs_cil_ctx *ctx;

	cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
	if (!cil)
		return -ENOMEM;

	ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
	if (!ctx) {
		kmem_free(cil);
		return -ENOMEM;
	}

	INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
	INIT_LIST_HEAD(&cil->xc_cil);
	INIT_LIST_HEAD(&cil->xc_committing);
	spin_lock_init(&cil->xc_cil_lock);
	spin_lock_init(&cil->xc_push_lock);
	init_rwsem(&cil->xc_ctx_lock);
	init_waitqueue_head(&cil->xc_commit_wait);

	INIT_LIST_HEAD(&ctx->committing);
	INIT_LIST_HEAD(&ctx->busy_extents);
	ctx->sequence = 1;
	ctx->cil = cil;
	cil->xc_ctx = ctx;
	cil->xc_current_sequence = ctx->sequence;

	cil->xc_log = log;
	log->l_cilp = cil;
	return 0;
}
void
xlog_cil_destroy(
	struct xlog	*log)
{
	if (log->l_cilp->xc_ctx) {
		if (log->l_cilp->xc_ctx->ticket)
			xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
		kmem_free(log->l_cilp->xc_ctx);
	}

	ASSERT(list_empty(&log->l_cilp->xc_cil));
	kmem_free(log->l_cilp);
}