/* fs/xfs/xfs_notify_failure.c */
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (c) 2022 Fujitsu. All Rights Reserved.
  4. */
  5. #include "xfs.h"
  6. #include "xfs_shared.h"
  7. #include "xfs_format.h"
  8. #include "xfs_log_format.h"
  9. #include "xfs_trans_resv.h"
  10. #include "xfs_mount.h"
  11. #include "xfs_alloc.h"
  12. #include "xfs_bit.h"
  13. #include "xfs_btree.h"
  14. #include "xfs_inode.h"
  15. #include "xfs_icache.h"
  16. #include "xfs_rmap.h"
  17. #include "xfs_rmap_btree.h"
  18. #include "xfs_rtalloc.h"
  19. #include "xfs_trans.h"
  20. #include "xfs_ag.h"
  21. #include <linux/mm.h>
  22. #include <linux/dax.h>
  23. #include <linux/fs.h>
/*
 * Describes one media-failure notification while it is walked through the
 * rmap btree: the failed block range (AG-relative), the MF_* flags passed
 * in by the memory-failure code, and whether the walk decided the fs must
 * be shut down afterwards.
 */
struct xfs_failure_info {
	xfs_agblock_t		startblock;	/* first failed block, AG-relative */
	xfs_extlen_t		blockcount;	/* length of the failed range in blocks */
	int			mf_flags;	/* MF_* flags from the notifier caller */
	bool			want_shutdown;	/* request shutdown after the rmap query */
};
  30. static pgoff_t
  31. xfs_failure_pgoff(
  32. struct xfs_mount *mp,
  33. const struct xfs_rmap_irec *rec,
  34. const struct xfs_failure_info *notify)
  35. {
  36. loff_t pos = XFS_FSB_TO_B(mp, rec->rm_offset);
  37. if (notify->startblock > rec->rm_startblock)
  38. pos += XFS_FSB_TO_B(mp,
  39. notify->startblock - rec->rm_startblock);
  40. return pos >> PAGE_SHIFT;
  41. }
  42. static unsigned long
  43. xfs_failure_pgcnt(
  44. struct xfs_mount *mp,
  45. const struct xfs_rmap_irec *rec,
  46. const struct xfs_failure_info *notify)
  47. {
  48. xfs_agblock_t end_rec;
  49. xfs_agblock_t end_notify;
  50. xfs_agblock_t start_cross;
  51. xfs_agblock_t end_cross;
  52. start_cross = max(rec->rm_startblock, notify->startblock);
  53. end_rec = rec->rm_startblock + rec->rm_blockcount;
  54. end_notify = notify->startblock + notify->blockcount;
  55. end_cross = min(end_rec, end_notify);
  56. return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
  57. }
/*
 * xfs_rmap_query_range() callback, one call per rmap record overlapping
 * the failed range.  For file-owned data extents, kill the processes
 * mapping the damaged pages; for metadata, just flag the mount for
 * shutdown.  Returning 0 keeps the rmap query going; a negative errno
 * aborts it and is propagated to the caller.
 */
static int
xfs_dax_failure_fn(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*data)
{
	struct xfs_mount		*mp = cur->bc_mp;
	struct xfs_inode		*ip;
	struct xfs_failure_info		*notify = data;
	struct address_space		*mapping;
	pgoff_t				pgoff;
	unsigned long			pgcnt;
	int				error = 0;

	/*
	 * Non-inode owners (AG metadata) and attr-fork/bmbt blocks have no
	 * user mappings to kill.
	 */
	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
		/* Continue the query because this isn't a failure. */
		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
			return 0;
		/* Damaged metadata: ask for a shutdown once the walk ends. */
		notify->want_shutdown = true;
		return 0;
	}

	/* Get files that incore, filter out others that are not in use. */
	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
			 0, &ip);
	/* Continue the rmap query if the inode isn't incore */
	if (error == -ENODATA)
		return 0;
	if (error) {
		/* Any other lookup failure also flags a shutdown. */
		notify->want_shutdown = true;
		return 0;
	}

	mapping = VFS_I(ip)->i_mapping;
	pgoff = xfs_failure_pgoff(mp, rec, notify);
	pgcnt = xfs_failure_pgcnt(mp, rec, notify);

	/* Continue the rmap query if the inode isn't a dax file. */
	if (dax_mapping(mapping))
		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
					  notify->mf_flags);

	/* Invalidate the cache in dax pages. */
	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
		invalidate_inode_pages2_range(mapping, pgoff,
					      pgoff + pgcnt - 1);

	xfs_irele(ip);
	return error;
}
  103. static int
  104. xfs_dax_notify_failure_freeze(
  105. struct xfs_mount *mp)
  106. {
  107. struct super_block *sb = mp->m_super;
  108. int error;
  109. error = freeze_super(sb, FREEZE_HOLDER_KERNEL);
  110. if (error)
  111. xfs_emerg(mp, "already frozen by kernel, err=%d", error);
  112. return error;
  113. }
  114. static void
  115. xfs_dax_notify_failure_thaw(
  116. struct xfs_mount *mp,
  117. bool kernel_frozen)
  118. {
  119. struct super_block *sb = mp->m_super;
  120. int error;
  121. if (kernel_frozen) {
  122. error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
  123. if (error)
  124. xfs_emerg(mp, "still frozen after notify failure, err=%d",
  125. error);
  126. }
  127. /*
  128. * Also thaw userspace call anyway because the device is about to be
  129. * removed immediately.
  130. */
  131. thaw_super(sb, FREEZE_HOLDER_USERSPACE);
  132. }
  133. static int
  134. xfs_dax_translate_range(
  135. struct xfs_buftarg *btp,
  136. u64 offset,
  137. u64 len,
  138. xfs_daddr_t *daddr,
  139. uint64_t *bblen)
  140. {
  141. u64 dev_start = btp->bt_dax_part_off;
  142. u64 dev_len = bdev_nr_bytes(btp->bt_bdev);
  143. u64 dev_end = dev_start + dev_len - 1;
  144. /* Notify failure on the whole device. */
  145. if (offset == 0 && len == U64_MAX) {
  146. offset = dev_start;
  147. len = dev_len;
  148. }
  149. /* Ignore the range out of filesystem area */
  150. if (offset + len - 1 < dev_start)
  151. return -ENXIO;
  152. if (offset > dev_end)
  153. return -ENXIO;
  154. /* Calculate the real range when it touches the boundary */
  155. if (offset > dev_start)
  156. offset -= dev_start;
  157. else {
  158. len -= dev_start - offset;
  159. offset = 0;
  160. }
  161. if (offset + len - 1 > dev_end)
  162. len = dev_end - offset + 1;
  163. *daddr = BTOBB(offset);
  164. *bblen = BTOBB(len);
  165. return 0;
  166. }
  167. static int
  168. xfs_dax_notify_logdev_failure(
  169. struct xfs_mount *mp,
  170. u64 offset,
  171. u64 len,
  172. int mf_flags)
  173. {
  174. xfs_daddr_t daddr;
  175. uint64_t bblen;
  176. int error;
  177. /*
  178. * Return ENXIO instead of shutting down the filesystem if the failed
  179. * region is beyond the end of the log.
  180. */
  181. error = xfs_dax_translate_range(mp->m_logdev_targp,
  182. offset, len, &daddr, &bblen);
  183. if (error)
  184. return error;
  185. /*
  186. * In the pre-remove case the failure notification is attempting to
  187. * trigger a force unmount. The expectation is that the device is
  188. * still present, but its removal is in progress and can not be
  189. * cancelled, proceed with accessing the log device.
  190. */
  191. if (mf_flags & MF_MEM_PRE_REMOVE)
  192. return 0;
  193. xfs_err(mp, "ondisk log corrupt, shutting down fs!");
  194. xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
  195. return -EFSCORRUPTED;
  196. }
/*
 * Notify all owners of a failed range on the data device.  Walks the rmap
 * btree of every AG touched by [daddr, daddr + bblen) and invokes
 * xfs_dax_failure_fn() for each overlapping record.  In the
 * MF_MEM_PRE_REMOVE case the fs is frozen for the duration of the walk and
 * force-unmounted afterwards; otherwise errors or damaged metadata trigger
 * a corruption shutdown.
 */
static int
xfs_dax_notify_ddev_failure(
	struct xfs_mount	*mp,
	xfs_daddr_t		daddr,
	xfs_daddr_t		bblen,
	int			mf_flags)
{
	struct xfs_failure_info	notify = { .mf_flags = mf_flags };
	struct xfs_trans	*tp = NULL;
	struct xfs_btree_cur	*cur = NULL;
	struct xfs_buf		*agf_bp = NULL;
	int			error = 0;
	bool			kernel_frozen = false;
	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, daddr);
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(mp, fsbno);
	xfs_fsblock_t		end_fsbno = XFS_DADDR_TO_FSB(mp,
							    daddr + bblen - 1);
	xfs_agnumber_t		end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);

	if (mf_flags & MF_MEM_PRE_REMOVE) {
		xfs_info(mp, "Device is about to be removed!");
		/*
		 * Freeze fs to prevent new mappings from being created.
		 * - Keep going on if others already hold the kernel frozen.
		 * - Keep going on if other errors too because this device is
		 *   starting to fail.
		 * - If kernel frozen state is held successfully here, thaw it
		 *   here as well at the end.
		 */
		kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
	}

	/* Empty transaction: only used to hold the AGF buffers locked. */
	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		goto out;

	for (; agno <= end_agno; agno++) {
		struct xfs_rmap_irec	ri_low = { };
		struct xfs_rmap_irec	ri_high;
		struct xfs_agf		*agf;
		struct xfs_perag	*pag;
		xfs_agblock_t		range_agend;

		pag = xfs_perag_get(mp, agno);
		error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
		if (error) {
			xfs_perag_put(pag);
			break;
		}

		cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);

		/*
		 * Set the rmap range from ri_low to ri_high, which represents
		 * a [start, end] where we looking for the files or metadata.
		 * ri_high defaults to "all ones" (end of AG) except in the
		 * last AG, where it is clipped to the end of the failed range.
		 */
		memset(&ri_high, 0xFF, sizeof(ri_high));
		ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
		if (agno == end_agno)
			ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno);

		/* Clamp the notify range to the actual AG size. */
		agf = agf_bp->b_addr;
		range_agend = min(be32_to_cpu(agf->agf_length) - 1,
				ri_high.rm_startblock);
		notify.startblock = ri_low.rm_startblock;
		notify.blockcount = range_agend + 1 - ri_low.rm_startblock;

		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
				xfs_dax_failure_fn, &notify);
		/* Release per-AG resources before moving to the next AG. */
		xfs_btree_del_cursor(cur, error);
		xfs_trans_brelse(tp, agf_bp);
		xfs_perag_put(pag);
		if (error)
			break;

		/* Every AG after the first is scanned from its first block. */
		fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0);
	}

	xfs_trans_cancel(tp);

	/*
	 * Shutdown fs from a force umount in pre-remove case which won't fail,
	 * so errors can be ignored.  Otherwise, shutdown the filesystem with
	 * CORRUPT flag if error occurred or notify.want_shutdown was set during
	 * RMAP querying.
	 */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
	else if (error || notify.want_shutdown) {
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
		if (!error)
			error = -EFSCORRUPTED;
	}

out:
	/* Thaw the fs if it has been frozen before. */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_dax_notify_failure_thaw(mp, kernel_frozen);

	return error;
}
  285. static int
  286. xfs_dax_notify_failure(
  287. struct dax_device *dax_dev,
  288. u64 offset,
  289. u64 len,
  290. int mf_flags)
  291. {
  292. struct xfs_mount *mp = dax_holder(dax_dev);
  293. xfs_daddr_t daddr;
  294. uint64_t bblen;
  295. int error;
  296. if (!(mp->m_super->s_flags & SB_BORN)) {
  297. xfs_warn(mp, "filesystem is not ready for notify_failure()!");
  298. return -EIO;
  299. }
  300. if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) {
  301. xfs_debug(mp,
  302. "notify_failure() not supported on realtime device!");
  303. return -EOPNOTSUPP;
  304. }
  305. if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
  306. mp->m_logdev_targp != mp->m_ddev_targp) {
  307. return xfs_dax_notify_logdev_failure(mp, offset, len, mf_flags);
  308. }
  309. if (!xfs_has_rmapbt(mp)) {
  310. xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
  311. return -EOPNOTSUPP;
  312. }
  313. error = xfs_dax_translate_range(mp->m_ddev_targp, offset, len, &daddr,
  314. &bblen);
  315. if (error)
  316. return error;
  317. return xfs_dax_notify_ddev_failure(mp, daddr, bblen, mf_flags);
  318. }
/* Registered with the dax device when XFS mounts on top of it. */
const struct dax_holder_operations xfs_dax_holder_operations = {
	.notify_failure		= xfs_dax_notify_failure,
};