  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (c) 2000-2005 Silicon Graphics, Inc.
  4. * All Rights Reserved.
  5. */
  6. #include "xfs.h"
  7. #include "xfs_fs.h"
  8. #include "xfs_shared.h"
  9. #include "xfs_format.h"
  10. #include "xfs_log_format.h"
  11. #include "xfs_trans_resv.h"
  12. #include "xfs_bit.h"
  13. #include "xfs_mount.h"
  14. #include "xfs_trans.h"
  15. #include "xfs_trans_priv.h"
  16. #include "xfs_buf_item.h"
  17. #include "xfs_inode.h"
  18. #include "xfs_inode_item.h"
  19. #include "xfs_quota.h"
  20. #include "xfs_dquot_item.h"
  21. #include "xfs_dquot.h"
  22. #include "xfs_trace.h"
  23. #include "xfs_log.h"
  24. #include "xfs_log_priv.h"
  25. #include "xfs_error.h"
  26. struct kmem_cache *xfs_buf_item_cache;
/* Convert a generic log item pointer back to its containing buf log item. */
static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
{
	return container_of(lip, struct xfs_buf_log_item, bli_item);
}
  31. /* Is this log iovec plausibly large enough to contain the buffer log format? */
  32. bool
  33. xfs_buf_log_check_iovec(
  34. struct xfs_log_iovec *iovec)
  35. {
  36. struct xfs_buf_log_format *blfp = iovec->i_addr;
  37. char *bmp_end;
  38. char *item_end;
  39. if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len)
  40. return false;
  41. item_end = (char *)iovec->i_addr + iovec->i_len;
  42. bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size];
  43. return bmp_end <= item_end;
  44. }
  45. static inline int
  46. xfs_buf_log_format_size(
  47. struct xfs_buf_log_format *blfp)
  48. {
  49. return offsetof(struct xfs_buf_log_format, blf_data_map) +
  50. (blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
  51. }
  52. static inline bool
  53. xfs_buf_item_straddle(
  54. struct xfs_buf *bp,
  55. uint offset,
  56. int first_bit,
  57. int nbits)
  58. {
  59. void *first, *last;
  60. first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT));
  61. last = xfs_buf_offset(bp,
  62. offset + ((first_bit + nbits) << XFS_BLF_SHIFT));
  63. if (last - first != nbits * XFS_BLF_CHUNK)
  64. return true;
  65. return false;
  66. }
/*
 * Return the number of log iovecs and space needed to log the given buf log
 * item segment.
 *
 * It calculates this as 1 iovec for the buf log format structure and 1 for each
 * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged
 * in a single iovec.
 */
STATIC void
xfs_buf_item_size_segment(
	struct xfs_buf_log_item	*bip,
	struct xfs_buf_log_format *blfp,
	uint			offset,
	int			*nvecs,
	int			*nbytes)
{
	struct xfs_buf		*bp = bip->bli_buf;
	int			first_bit;
	int			nbits;
	int			next_bit;
	int			last_bit;

	/* No dirty bits in this segment: it contributes no iovecs at all. */
	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
	if (first_bit == -1)
		return;

	/* One iovec for the buf log format structure itself. */
	(*nvecs)++;
	*nbytes += xfs_buf_log_format_size(blfp);

	do {
		nbits = xfs_contig_bits(blfp->blf_data_map,
					blfp->blf_map_size, first_bit);
		ASSERT(nbits > 0);

		/*
		 * Straddling a page is rare because we don't log contiguous
		 * chunks of unmapped buffers anywhere.
		 */
		if (nbits > 1 &&
		    xfs_buf_item_straddle(bp, offset, first_bit, nbits))
			goto slow_scan;

		/* One data iovec covers the whole contiguous dirty run. */
		(*nvecs)++;
		*nbytes += nbits * XFS_BLF_CHUNK;

		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there.  It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					 (uint)first_bit + nbits + 1);
	} while (first_bit != -1);

	return;

slow_scan:
	/* Count the first bit we jumped out of the above loop from */
	(*nvecs)++;
	*nbytes += XFS_BLF_CHUNK;
	last_bit = first_bit;
	while (last_bit != -1) {
		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there.  It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					last_bit + 1);
		/*
		 * If we run out of bits, leave the loop,
		 * else if we find a new set of bits bump the number of vecs,
		 * else keep scanning the current set of bits.
		 */
		if (next_bit == -1) {
			break;
		} else if (next_bit != last_bit + 1 ||
			   xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
			/* New discontiguous run: starts its own iovec. */
			last_bit = next_bit;
			first_bit = next_bit;
			(*nvecs)++;
			nbits = 1;
		} else {
			/* Still in the same contiguous run. */
			last_bit++;
			nbits++;
		}
		/* Every dirty bit accounts for one chunk of log space. */
		*nbytes += XFS_BLF_CHUNK;
	}
}
/*
 * Return the number of log iovecs and space needed to log the given buf log
 * item.
 *
 * Discontiguous buffers need a format structure per region that is being
 * logged. This makes the changes in the buffer appear to log recovery as though
 * they came from separate buffers, just like would occur if multiple buffers
 * were used instead of a single discontiguous buffer. This enables
 * discontiguous buffers to be in-memory constructs, completely transparent to
 * what ends up on disk.
 *
 * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
 * format structures. If the item has previously been logged and has dirty
 * regions, we do not relog them in stale buffers. This has the effect of
 * reducing the size of the relogged item by the amount of dirty data tracked
 * by the log item. This can result in the committing transaction reducing the
 * amount of space being consumed by the CIL.
 */
STATIC void
xfs_buf_item_size(
	struct xfs_log_item	*lip,
	int			*nvecs,
	int			*nbytes)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	int			i;
	int			bytes;
	uint			offset = 0;

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	if (bip->bli_flags & XFS_BLI_STALE) {
		/*
		 * The buffer is stale, so all we need to log is the buf log
		 * format structure with the cancel flag in it as we are never
		 * going to replay the changes tracked in the log item.
		 */
		trace_xfs_buf_item_size_stale(bip);
		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
		*nvecs += bip->bli_format_count;
		for (i = 0; i < bip->bli_format_count; i++) {
			*nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
		}
		return;
	}

	ASSERT(bip->bli_flags & XFS_BLI_LOGGED);

	if (bip->bli_flags & XFS_BLI_ORDERED) {
		/*
		 * The buffer has been logged just to order it. It is not being
		 * included in the transaction commit, so no vectors are used at
		 * all.
		 */
		trace_xfs_buf_item_size_ordered(bip);
		*nvecs = XFS_LOG_VEC_ORDERED;
		return;
	}

	/*
	 * The vector count is based on the number of buffer vectors we have
	 * dirty bits in. This will only be greater than one when we have a
	 * compound buffer with more than one segment dirty. Hence for compound
	 * buffers we need to track which segment the dirty bits correspond to,
	 * and when we move from one segment to the next increment the vector
	 * count for the extra buf log format structure that will need to be
	 * written.
	 */
	bytes = 0;
	for (i = 0; i < bip->bli_format_count; i++) {
		xfs_buf_item_size_segment(bip, &bip->bli_formats[i], offset,
					  nvecs, &bytes);
		offset += BBTOB(bp->b_maps[i].bm_len);
	}

	/*
	 * Round up the buffer size required to minimise the number of memory
	 * allocations that need to be done as this item grows when relogged by
	 * repeated modifications.
	 */
	*nbytes = round_up(bytes, 512);
	trace_xfs_buf_item_size(bip);
}
  228. static inline void
  229. xfs_buf_item_copy_iovec(
  230. struct xfs_log_vec *lv,
  231. struct xfs_log_iovec **vecp,
  232. struct xfs_buf *bp,
  233. uint offset,
  234. int first_bit,
  235. uint nbits)
  236. {
  237. offset += first_bit * XFS_BLF_CHUNK;
  238. xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
  239. xfs_buf_offset(bp, offset),
  240. nbits * XFS_BLF_CHUNK);
  241. }
/*
 * Format one segment of a (possibly discontiguous) buffer: emit the buf log
 * format structure followed by one data iovec per contiguous run of dirty
 * chunks. Stale buffers emit only the (cancelled) format structure.
 */
static void
xfs_buf_item_format_segment(
	struct xfs_buf_log_item	*bip,
	struct xfs_log_vec	*lv,
	struct xfs_log_iovec	**vecp,
	uint			offset,
	struct xfs_buf_log_format *blfp)
{
	struct xfs_buf		*bp = bip->bli_buf;
	uint			base_size;
	int			first_bit;
	int			last_bit;
	int			next_bit;
	uint			nbits;

	/* copy the flags across from the base format item */
	blfp->blf_flags = bip->__bli_format.blf_flags;

	/*
	 * Base size is the actual size of the ondisk structure - it reflects
	 * the actual size of the dirty bitmap rather than the size of the in
	 * memory structure.
	 */
	base_size = xfs_buf_log_format_size(blfp);

	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
	if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
		/*
		 * If the map is not dirty in the transaction, mark
		 * the size as zero and do not advance the vector pointer.
		 */
		return;
	}

	blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
	blfp->blf_size = 1;

	if (bip->bli_flags & XFS_BLI_STALE) {
		/*
		 * The buffer is stale, so all we need to log
		 * is the buf log format structure with the
		 * cancel flag in it.
		 */
		trace_xfs_buf_item_format_stale(bip);
		ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
		return;
	}

	/*
	 * Fill in an iovec for each set of contiguous chunks.
	 */
	do {
		ASSERT(first_bit >= 0);
		nbits = xfs_contig_bits(blfp->blf_data_map,
					blfp->blf_map_size, first_bit);
		ASSERT(nbits > 0);

		/*
		 * Straddling a page is rare because we don't log contiguous
		 * chunks of unmapped buffers anywhere.
		 */
		if (nbits > 1 &&
		    xfs_buf_item_straddle(bp, offset, first_bit, nbits))
			goto slow_scan;

		xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
					first_bit, nbits);
		blfp->blf_size++;

		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there.  It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					 (uint)first_bit + nbits + 1);
	} while (first_bit != -1);

	return;

slow_scan:
	/* Unmapped buffers only: walk bit by bit across mapping boundaries. */
	ASSERT(bp->b_addr == NULL);
	last_bit = first_bit;
	nbits = 1;
	for (;;) {
		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there.  It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					(uint)last_bit + 1);
		/*
		 * If we run out of bits fill in the last iovec and get out of
		 * the loop.  Else if we start a new set of bits then fill in
		 * the iovec for the series we were looking at and start
		 * counting the bits in the new one.  Else we're still in the
		 * same set of bits so just keep counting and scanning.
		 */
		if (next_bit == -1) {
			xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
						first_bit, nbits);
			blfp->blf_size++;
			break;
		} else if (next_bit != last_bit + 1 ||
			   xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
			xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
						first_bit, nbits);
			blfp->blf_size++;
			first_bit = next_bit;
			last_bit = next_bit;
			nbits = 1;
		} else {
			last_bit++;
			nbits++;
		}
	}
}
/*
 * This is called to fill in the vector of log iovecs for the
 * given log buf item.  It fills the first entry with a buf log
 * format structure, and the rest point to contiguous chunks
 * within the buffer.
 */
STATIC void
xfs_buf_item_format(
	struct xfs_log_item	*lip,
	struct xfs_log_vec	*lv)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	struct xfs_log_iovec	*vecp = NULL;
	uint			offset = 0;
	int			i;

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
	       (bip->bli_flags & XFS_BLI_STALE));
	ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
	       (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
	        && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
	ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
	       (bip->bli_flags & XFS_BLI_STALE));

	/*
	 * If it is an inode buffer, transfer the in-memory state to the
	 * format flags and clear the in-memory state.
	 *
	 * For buffer based inode allocation, we do not transfer
	 * this state if the inode buffer allocation has not yet been committed
	 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
	 * correct replay of the inode allocation.
	 *
	 * For icreate item based inode allocation, the buffers aren't written
	 * to the journal during allocation, and hence we should always tag the
	 * buffer as an inode buffer so that the correct unlinked list replay
	 * occurs during recovery.
	 */
	if (bip->bli_flags & XFS_BLI_INODE_BUF) {
		if (xfs_has_v3inodes(lip->li_log->l_mp) ||
		    !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
		      xfs_log_item_in_current_chkpt(lip)))
			bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
		bip->bli_flags &= ~XFS_BLI_INODE_BUF;
	}

	/* Format each segment; offset advances by each map's length in bytes. */
	for (i = 0; i < bip->bli_format_count; i++) {
		xfs_buf_item_format_segment(bip, lv, &vecp, offset,
					    &bip->bli_formats[i]);
		offset += BBTOB(bp->b_maps[i].bm_len);
	}

	/*
	 * Check to make sure everything is consistent.
	 */
	trace_xfs_buf_item_format(bip);
}
  406. /*
  407. * This is called to pin the buffer associated with the buf log item in memory
  408. * so it cannot be written out.
  409. *
  410. * We take a reference to the buffer log item here so that the BLI life cycle
  411. * extends at least until the buffer is unpinned via xfs_buf_item_unpin() and
  412. * inserted into the AIL.
  413. *
  414. * We also need to take a reference to the buffer itself as the BLI unpin
  415. * processing requires accessing the buffer after the BLI has dropped the final
  416. * BLI reference. See xfs_buf_item_unpin() for an explanation.
  417. * If unpins race to drop the final BLI reference and only the
  418. * BLI owns a reference to the buffer, then the loser of the race can have the
 * buffer freed from under it (e.g. on shutdown). Taking a buffer reference per
  420. * pin count ensures the life cycle of the buffer extends for as
  421. * long as we hold the buffer pin reference in xfs_buf_item_unpin().
  422. */
STATIC void
xfs_buf_item_pin(
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
	       (bip->bli_flags & XFS_BLI_ORDERED) ||
	       (bip->bli_flags & XFS_BLI_STALE));

	trace_xfs_buf_item_pin(bip);

	/* Buffer reference, BLI reference and pin count all travel together. */
	xfs_buf_hold(bip->bli_buf);
	atomic_inc(&bip->bli_refcount);
	atomic_inc(&bip->bli_buf->b_pin_count);
}
/*
 * This is called to unpin the buffer associated with the buf log item which was
 * previously pinned with a call to xfs_buf_item_pin().  We enter this function
 * with a buffer pin count, a buffer reference and a BLI reference.
 *
 * We must drop the BLI reference before we unpin the buffer because the AIL
 * doesn't acquire a BLI reference whenever it accesses it. Therefore if the
 * refcount drops to zero, the bli could still be AIL resident and the buffer
 * submitted for I/O at any point before we return. This can result in IO
 * completion freeing the buffer while we are still trying to access it here.
 * This race condition can also occur in shutdown situations where we abort and
 * unpin buffers from contexts other than journal IO completion.
 *
 * Hence we have to hold a buffer reference per pin count to ensure that the
 * buffer cannot be freed until we have finished processing the unpin operation.
 * The reference is taken in xfs_buf_item_pin(), and we must hold it until we
 * are done processing the buffer state. In the case of an abort (remove =
 * true) then we re-use the current pin reference as the IO reference we hand
 * off to IO failure handling.
 */
STATIC void
xfs_buf_item_unpin(
	struct xfs_log_item	*lip,
	int			remove)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	int			stale = bip->bli_flags & XFS_BLI_STALE;
	int			freed;

	ASSERT(bp->b_log_item == bip);
	ASSERT(atomic_read(&bip->bli_refcount) > 0);

	trace_xfs_buf_item_unpin(bip);

	/* Drop the BLI reference first - see the comment above. */
	freed = atomic_dec_and_test(&bip->bli_refcount);
	if (atomic_dec_and_test(&bp->b_pin_count))
		wake_up_all(&bp->b_waiters);

	/*
	 * Nothing to do but drop the buffer pin reference if the BLI is
	 * still active.
	 */
	if (!freed) {
		xfs_buf_rele(bp);
		return;
	}

	if (stale) {
		ASSERT(bip->bli_flags & XFS_BLI_STALE);
		ASSERT(xfs_buf_islocked(bp));
		ASSERT(bp->b_flags & XBF_STALE);
		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
		ASSERT(list_empty(&lip->li_trans));
		ASSERT(!bp->b_transp);

		trace_xfs_buf_item_unpin_stale(bip);

		/*
		 * The buffer has been locked and referenced since it was marked
		 * stale so we own both lock and reference exclusively here. We
		 * do not need the pin reference any more, so drop it now so
		 * that we only have one reference to drop once item completion
		 * processing is complete.
		 */
		xfs_buf_rele(bp);

		/*
		 * If we get called here because of an IO error, we may or may
		 * not have the item on the AIL. xfs_trans_ail_delete() will
		 * take care of that situation. xfs_trans_ail_delete() drops
		 * the AIL lock.
		 */
		if (bip->bli_flags & XFS_BLI_STALE_INODE) {
			xfs_buf_item_done(bp);
			xfs_buf_inode_iodone(bp);
			ASSERT(list_empty(&bp->b_li_list));
		} else {
			xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
			xfs_buf_item_relse(bp);
			ASSERT(bp->b_log_item == NULL);
		}
		xfs_buf_relse(bp);
		return;
	}

	if (remove) {
		/*
		 * We need to simulate an async IO failures here to ensure that
		 * the correct error completion is run on this buffer. This
		 * requires a reference to the buffer and for the buffer to be
		 * locked. We can safely pass ownership of the pin reference to
		 * the IO to ensure that nothing can free the buffer while we
		 * wait for the lock and then run the IO failure completion.
		 */
		xfs_buf_lock(bp);
		bp->b_flags |= XBF_ASYNC;
		xfs_buf_ioend_fail(bp);
		return;
	}

	/*
	 * BLI has no more active references - it will be moved to the AIL to
	 * manage the remaining BLI/buffer life cycle. There is nothing left for
	 * us to do here so drop the pin reference to the buffer.
	 */
	xfs_buf_rele(bp);
}
/*
 * AIL push handler: try to queue the buffer for delayed write. Returns one
 * of the XFS_ITEM_* push result codes depending on pin/lock state.
 */
STATIC uint
xfs_buf_item_push(
	struct xfs_log_item	*lip,
	struct list_head	*buffer_list)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	uint			rval = XFS_ITEM_SUCCESS;

	if (xfs_buf_ispinned(bp))
		return XFS_ITEM_PINNED;
	if (!xfs_buf_trylock(bp)) {
		/*
		 * If we have just raced with a buffer being pinned and it has
		 * been marked stale, we could end up stalling until someone else
		 * issues a log force to unpin the stale buffer. Check for the
		 * race condition here so xfsaild recognizes the buffer is pinned
		 * and queues a log force to move it along.
		 */
		if (xfs_buf_ispinned(bp))
			return XFS_ITEM_PINNED;
		return XFS_ITEM_LOCKED;
	}

	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));

	trace_xfs_buf_item_push(bip);

	/* has a previous flush failed due to IO errors? */
	if (bp->b_flags & XBF_WRITE_FAIL) {
		xfs_buf_alert_ratelimited(bp, "XFS: Failing async write",
	    "Failing async write on buffer block 0x%llx. Retrying async write.",
					  (long long)xfs_buf_daddr(bp));
	}

	if (!xfs_buf_delwri_queue(bp, buffer_list))
		rval = XFS_ITEM_FLUSHING;
	xfs_buf_unlock(bp);
	return rval;
}
/*
 * Drop the buffer log item refcount and take appropriate action. This helper
 * determines whether the bli must be freed or not, since a decrement to zero
 * does not necessarily mean the bli is unused.
 *
 * Return true if the bli is freed, false otherwise.
 */
bool
xfs_buf_item_put(
	struct xfs_buf_log_item	*bip)
{
	struct xfs_log_item	*lip = &bip->bli_item;
	bool			aborted;
	bool			dirty;

	/* drop the bli ref and return if it wasn't the last one */
	if (!atomic_dec_and_test(&bip->bli_refcount))
		return false;

	/*
	 * We dropped the last ref and must free the item if clean or aborted.
	 * If the bli is dirty and non-aborted, the buffer was clean in the
	 * transaction but still awaiting writeback from previous changes. In
	 * that case, the bli is freed on buffer writeback completion.
	 */
	aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
		  xlog_is_shutdown(lip->li_log);
	dirty = bip->bli_flags & XFS_BLI_DIRTY;
	if (dirty && !aborted)
		return false;

	/*
	 * The bli is aborted or clean. An aborted item may be in the AIL
	 * regardless of dirty state.  For example, consider an aborted
	 * transaction that invalidated a dirty bli and cleared the dirty
	 * state.
	 */
	if (aborted)
		xfs_trans_ail_delete(lip, 0);
	xfs_buf_item_relse(bip->bli_buf);
	return true;
}
/*
 * Release the buffer associated with the buf log item.  If there is no dirty
 * logged data associated with the buffer recorded in the buf log item, then
 * free the buf log item and remove the reference to it in the buffer.
 *
 * This call ignores the recursion count.  It is only called when the buffer
 * should REALLY be unlocked, regardless of the recursion count.
 *
 * We unconditionally drop the transaction's reference to the log item. If the
 * item was logged, then another reference was taken when it was pinned, so we
 * can safely drop the transaction reference now.  This also allows us to avoid
 * potential races with the unpin code freeing the bli by not referencing the
 * bli after we've dropped the reference count.
 *
 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
 * if necessary but do not unlock the buffer.  This is for support of
 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
 * free the item.
 */
STATIC void
xfs_buf_item_release(
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	bool			released;
	bool			hold = bip->bli_flags & XFS_BLI_HOLD;
	bool			stale = bip->bli_flags & XFS_BLI_STALE;
#if defined(DEBUG) || defined(XFS_WARN)
	bool			ordered = bip->bli_flags & XFS_BLI_ORDERED;
	bool			dirty = bip->bli_flags & XFS_BLI_DIRTY;
	bool			aborted = test_bit(XFS_LI_ABORTED,
						   &lip->li_flags);
#endif

	trace_xfs_buf_item_release(bip);

	/*
	 * The bli dirty state should match whether the blf has logged segments
	 * except for ordered buffers, where only the bli should be dirty.
	 */
	ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
	       (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
	ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL));

	/*
	 * Clear the buffer's association with this transaction and
	 * per-transaction state from the bli, which has been copied above.
	 */
	bp->b_transp = NULL;
	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);

	/*
	 * Unref the item and unlock the buffer unless held or stale. Stale
	 * buffers remain locked until final unpin unless the bli is freed by
	 * the unref call. The latter implies shutdown because buffer
	 * invalidation dirties the bli and transaction.
	 */
	released = xfs_buf_item_put(bip);
	if (hold || (stale && !released))
		return;
	ASSERT(!stale || aborted);
	xfs_buf_relse(bp);
}
  669. STATIC void
  670. xfs_buf_item_committing(
  671. struct xfs_log_item *lip,
  672. xfs_csn_t seq)
  673. {
  674. return xfs_buf_item_release(lip);
  675. }
  676. /*
  677. * This is called to find out where the oldest active copy of the
  678. * buf log item in the on disk log resides now that the last log
  679. * write of it completed at the given lsn.
  680. * We always re-log all the dirty data in a buffer, so usually the
  681. * latest copy in the on disk log is the only one that matters. For
  682. * those cases we simply return the given lsn.
  683. *
  684. * The one exception to this is for buffers full of newly allocated
  685. * inodes. These buffers are only relogged with the XFS_BLI_INODE_BUF
  686. * flag set, indicating that only the di_next_unlinked fields from the
  687. * inodes in the buffers will be replayed during recovery. If the
  688. * original newly allocated inode images have not yet been flushed
  689. * when the buffer is so relogged, then we need to make sure that we
  690. * keep the old images in the 'active' portion of the log. We do this
  691. * by returning the original lsn of that transaction here rather than
  692. * the current one.
  693. */
  694. STATIC xfs_lsn_t
  695. xfs_buf_item_committed(
  696. struct xfs_log_item *lip,
  697. xfs_lsn_t lsn)
  698. {
  699. struct xfs_buf_log_item *bip = BUF_ITEM(lip);
  700. trace_xfs_buf_item_committed(bip);
  701. if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
  702. return lip->li_lsn;
  703. return lsn;
  704. }
#ifdef DEBUG_EXPENSIVE
/*
 * Run the buffer's structure verifier at precommit time so in-memory
 * corruption is detected (and the filesystem shut down) before the bad
 * contents reach the journal. Debug-only; compiled out otherwise.
 */
static int
xfs_buf_item_precommit(
	struct xfs_trans	*tp,
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	struct xfs_mount	*mp = bp->b_mount;
	xfs_failaddr_t		fa;

	/* Nothing to check without a structure verifier. */
	if (!bp->b_ops || !bp->b_ops->verify_struct)
		return 0;
	/* Stale buffers are being invalidated; contents don't matter. */
	if (bip->bli_flags & XFS_BLI_STALE)
		return 0;

	fa = bp->b_ops->verify_struct(bp);
	if (fa) {
		xfs_buf_verifier_error(bp, -EFSCORRUPTED, bp->b_ops->name,
				bp->b_addr, BBTOB(bp->b_length), fa);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
		ASSERT(fa == NULL);
	}
	return 0;
}
#else
# define xfs_buf_item_precommit	NULL
#endif
/* Log item operations vector for buf log items. */
static const struct xfs_item_ops xfs_buf_item_ops = {
	.iop_size	= xfs_buf_item_size,
	.iop_precommit	= xfs_buf_item_precommit,
	.iop_format	= xfs_buf_item_format,
	.iop_pin	= xfs_buf_item_pin,
	.iop_unpin	= xfs_buf_item_unpin,
	.iop_release	= xfs_buf_item_release,
	.iop_committing	= xfs_buf_item_committing,
	.iop_committed	= xfs_buf_item_committed,
	.iop_push	= xfs_buf_item_push,
};
  742. STATIC void
  743. xfs_buf_item_get_format(
  744. struct xfs_buf_log_item *bip,
  745. int count)
  746. {
  747. ASSERT(bip->bli_formats == NULL);
  748. bip->bli_format_count = count;
  749. if (count == 1) {
  750. bip->bli_formats = &bip->__bli_format;
  751. return;
  752. }
  753. bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format),
  754. GFP_KERNEL | __GFP_NOFAIL);
  755. }
  756. STATIC void
  757. xfs_buf_item_free_format(
  758. struct xfs_buf_log_item *bip)
  759. {
  760. if (bip->bli_formats != &bip->__bli_format) {
  761. kfree(bip->bli_formats);
  762. bip->bli_formats = NULL;
  763. }
  764. }
  765. /*
  766. * Allocate a new buf log item to go with the given buffer.
  767. * Set the buffer's b_log_item field to point to the new
  768. * buf log item.
  769. */
  770. int
  771. xfs_buf_item_init(
  772. struct xfs_buf *bp,
  773. struct xfs_mount *mp)
  774. {
  775. struct xfs_buf_log_item *bip = bp->b_log_item;
  776. int chunks;
  777. int map_size;
  778. int i;
  779. /*
  780. * Check to see if there is already a buf log item for
  781. * this buffer. If we do already have one, there is
  782. * nothing to do here so return.
  783. */
  784. ASSERT(bp->b_mount == mp);
  785. if (bip) {
  786. ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
  787. ASSERT(!bp->b_transp);
  788. ASSERT(bip->bli_buf == bp);
  789. return 0;
  790. }
  791. bip = kmem_cache_zalloc(xfs_buf_item_cache, GFP_KERNEL | __GFP_NOFAIL);
  792. xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
  793. bip->bli_buf = bp;
  794. /*
  795. * chunks is the number of XFS_BLF_CHUNK size pieces the buffer
  796. * can be divided into. Make sure not to truncate any pieces.
  797. * map_size is the size of the bitmap needed to describe the
  798. * chunks of the buffer.
  799. *
  800. * Discontiguous buffer support follows the layout of the underlying
  801. * buffer. This makes the implementation as simple as possible.
  802. */
  803. xfs_buf_item_get_format(bip, bp->b_map_count);
  804. for (i = 0; i < bip->bli_format_count; i++) {
  805. chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
  806. XFS_BLF_CHUNK);
  807. map_size = DIV_ROUND_UP(chunks, NBWORD);
  808. if (map_size > XFS_BLF_DATAMAP_SIZE) {
  809. kmem_cache_free(xfs_buf_item_cache, bip);
  810. xfs_err(mp,
  811. "buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
  812. map_size,
  813. BBTOB(bp->b_maps[i].bm_len));
  814. return -EFSCORRUPTED;
  815. }
  816. bip->bli_formats[i].blf_type = XFS_LI_BUF;
  817. bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn;
  818. bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len;
  819. bip->bli_formats[i].blf_map_size = map_size;
  820. }
  821. bp->b_log_item = bip;
  822. xfs_buf_hold(bp);
  823. return 0;
  824. }
  825. /*
  826. * Mark bytes first through last inclusive as dirty in the buf
  827. * item's bitmap.
  828. */
  829. static void
  830. xfs_buf_item_log_segment(
  831. uint first,
  832. uint last,
  833. uint *map)
  834. {
  835. uint first_bit;
  836. uint last_bit;
  837. uint bits_to_set;
  838. uint bits_set;
  839. uint word_num;
  840. uint *wordp;
  841. uint bit;
  842. uint end_bit;
  843. uint mask;
  844. ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
  845. ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
  846. /*
  847. * Convert byte offsets to bit numbers.
  848. */
  849. first_bit = first >> XFS_BLF_SHIFT;
  850. last_bit = last >> XFS_BLF_SHIFT;
  851. /*
  852. * Calculate the total number of bits to be set.
  853. */
  854. bits_to_set = last_bit - first_bit + 1;
  855. /*
  856. * Get a pointer to the first word in the bitmap
  857. * to set a bit in.
  858. */
  859. word_num = first_bit >> BIT_TO_WORD_SHIFT;
  860. wordp = &map[word_num];
  861. /*
  862. * Calculate the starting bit in the first word.
  863. */
  864. bit = first_bit & (uint)(NBWORD - 1);
  865. /*
  866. * First set any bits in the first word of our range.
  867. * If it starts at bit 0 of the word, it will be
  868. * set below rather than here. That is what the variable
  869. * bit tells us. The variable bits_set tracks the number
  870. * of bits that have been set so far. End_bit is the number
  871. * of the last bit to be set in this word plus one.
  872. */
  873. if (bit) {
  874. end_bit = min(bit + bits_to_set, (uint)NBWORD);
  875. mask = ((1U << (end_bit - bit)) - 1) << bit;
  876. *wordp |= mask;
  877. wordp++;
  878. bits_set = end_bit - bit;
  879. } else {
  880. bits_set = 0;
  881. }
  882. /*
  883. * Now set bits a whole word at a time that are between
  884. * first_bit and last_bit.
  885. */
  886. while ((bits_to_set - bits_set) >= NBWORD) {
  887. *wordp = 0xffffffff;
  888. bits_set += NBWORD;
  889. wordp++;
  890. }
  891. /*
  892. * Finally, set any bits left to be set in one last partial word.
  893. */
  894. end_bit = bits_to_set - bits_set;
  895. if (end_bit) {
  896. mask = (1U << end_bit) - 1;
  897. *wordp |= mask;
  898. }
  899. }
  900. /*
  901. * Mark bytes first through last inclusive as dirty in the buf
  902. * item's bitmap.
  903. */
  904. void
  905. xfs_buf_item_log(
  906. struct xfs_buf_log_item *bip,
  907. uint first,
  908. uint last)
  909. {
  910. int i;
  911. uint start;
  912. uint end;
  913. struct xfs_buf *bp = bip->bli_buf;
  914. /*
  915. * walk each buffer segment and mark them dirty appropriately.
  916. */
  917. start = 0;
  918. for (i = 0; i < bip->bli_format_count; i++) {
  919. if (start > last)
  920. break;
  921. end = start + BBTOB(bp->b_maps[i].bm_len) - 1;
  922. /* skip to the map that includes the first byte to log */
  923. if (first > end) {
  924. start += BBTOB(bp->b_maps[i].bm_len);
  925. continue;
  926. }
  927. /*
  928. * Trim the range to this segment and mark it in the bitmap.
  929. * Note that we must convert buffer offsets to segment relative
  930. * offsets (e.g., the first byte of each segment is byte 0 of
  931. * that segment).
  932. */
  933. if (first < start)
  934. first = start;
  935. if (end > last)
  936. end = last;
  937. xfs_buf_item_log_segment(first - start, end - start,
  938. &bip->bli_formats[i].blf_data_map[0]);
  939. start += BBTOB(bp->b_maps[i].bm_len);
  940. }
  941. }
  942. /*
  943. * Return true if the buffer has any ranges logged/dirtied by a transaction,
  944. * false otherwise.
  945. */
  946. bool
  947. xfs_buf_item_dirty_format(
  948. struct xfs_buf_log_item *bip)
  949. {
  950. int i;
  951. for (i = 0; i < bip->bli_format_count; i++) {
  952. if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
  953. bip->bli_formats[i].blf_map_size))
  954. return true;
  955. }
  956. return false;
  957. }
  958. STATIC void
  959. xfs_buf_item_free(
  960. struct xfs_buf_log_item *bip)
  961. {
  962. xfs_buf_item_free_format(bip);
  963. kvfree(bip->bli_item.li_lv_shadow);
  964. kmem_cache_free(xfs_buf_item_cache, bip);
  965. }
  966. /*
  967. * xfs_buf_item_relse() is called when the buf log item is no longer needed.
  968. */
  969. void
  970. xfs_buf_item_relse(
  971. struct xfs_buf *bp)
  972. {
  973. struct xfs_buf_log_item *bip = bp->b_log_item;
  974. trace_xfs_buf_item_relse(bp, _RET_IP_);
  975. ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
  976. if (atomic_read(&bip->bli_refcount))
  977. return;
  978. bp->b_log_item = NULL;
  979. xfs_buf_rele(bp);
  980. xfs_buf_item_free(bip);
  981. }
  982. void
  983. xfs_buf_item_done(
  984. struct xfs_buf *bp)
  985. {
  986. /*
  987. * If we are forcibly shutting down, this may well be off the AIL
  988. * already. That's because we simulate the log-committed callbacks to
  989. * unpin these buffers. Or we may never have put this item on AIL
  990. * because of the transaction was aborted forcibly.
  991. * xfs_trans_ail_delete() takes care of these.
  992. *
  993. * Either way, AIL is useless if we're forcing a shutdown.
  994. *
  995. * Note that log recovery writes might have buffer items that are not on
  996. * the AIL even when the file system is not shut down.
  997. */
  998. xfs_trans_ail_delete(&bp->b_log_item->bli_item,
  999. (bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
  1000. SHUTDOWN_CORRUPT_INCORE);
  1001. xfs_buf_item_relse(bp);
  1002. }