/* fs/xfs/xfs_notify_failure.c */
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (c) 2022 Fujitsu. All Rights Reserved.
  4. */
  5. #include "xfs.h"
  6. #include "xfs_shared.h"
  7. #include "xfs_format.h"
  8. #include "xfs_log_format.h"
  9. #include "xfs_trans_resv.h"
  10. #include "xfs_mount.h"
  11. #include "xfs_alloc.h"
  12. #include "xfs_bit.h"
  13. #include "xfs_btree.h"
  14. #include "xfs_inode.h"
  15. #include "xfs_icache.h"
  16. #include "xfs_rmap.h"
  17. #include "xfs_rmap_btree.h"
  18. #include "xfs_rtalloc.h"
  19. #include "xfs_trans.h"
  20. #include "xfs_ag.h"
  21. #include <linux/mm.h>
  22. #include <linux/dax.h>
  23. #include <linux/fs.h>
/*
 * Describes one media-failure notification while it is walked through the
 * rmap btree: the failed block range (AG-relative), the MF_* flags passed
 * in by the memory-failure code, and whether the walk decided the fs must
 * be shut down afterwards.
 */
struct xfs_failure_info {
	xfs_agblock_t		startblock;	/* first failed block, AG-relative */
	xfs_extlen_t		blockcount;	/* length of the failed range in blocks */
	int			mf_flags;	/* MF_* flags from the notifier caller */
	bool			want_shutdown;	/* request shutdown after the rmap query */
};
  30. static pgoff_t
  31. xfs_failure_pgoff(
  32. struct xfs_mount *mp,
  33. const struct xfs_rmap_irec *rec,
  34. const struct xfs_failure_info *notify)
  35. {
  36. loff_t pos = XFS_FSB_TO_B(mp, rec->rm_offset);
  37. if (notify->startblock > rec->rm_startblock)
  38. pos += XFS_FSB_TO_B(mp,
  39. notify->startblock - rec->rm_startblock);
  40. return pos >> PAGE_SHIFT;
  41. }
  42. static unsigned long
  43. xfs_failure_pgcnt(
  44. struct xfs_mount *mp,
  45. const struct xfs_rmap_irec *rec,
  46. const struct xfs_failure_info *notify)
  47. {
  48. xfs_agblock_t end_rec;
  49. xfs_agblock_t end_notify;
  50. xfs_agblock_t start_cross;
  51. xfs_agblock_t end_cross;
  52. start_cross = max(rec->rm_startblock, notify->startblock);
  53. end_rec = rec->rm_startblock + rec->rm_blockcount;
  54. end_notify = notify->startblock + notify->blockcount;
  55. end_cross = min(end_rec, end_notify);
  56. return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
  57. }
/*
 * xfs_rmap_query_range() callback, one call per rmap record overlapping
 * the failed range.  For file-owned data extents, kill the processes
 * mapping the damaged pages; for metadata, just flag the mount for
 * shutdown.  Returning 0 keeps the rmap query going; a negative errno
 * aborts it and is propagated to the caller.
 */
static int
xfs_dax_failure_fn(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*data)
{
	struct xfs_mount		*mp = cur->bc_mp;
	struct xfs_inode		*ip;
	struct xfs_failure_info		*notify = data;
	struct address_space		*mapping;
	pgoff_t				pgoff;
	unsigned long			pgcnt;
	int				error = 0;

	/*
	 * Non-inode owners (AG metadata) and attr-fork/bmbt blocks have no
	 * user mappings to kill.
	 */
	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
		/* Continue the query because this isn't a failure. */
		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
			return 0;
		/* Damaged metadata: ask for a shutdown once the walk ends. */
		notify->want_shutdown = true;
		return 0;
	}

	/* Get files that incore, filter out others that are not in use. */
	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
			 0, &ip);
	/* Continue the rmap query if the inode isn't incore */
	if (error == -ENODATA)
		return 0;
	if (error) {
		/* Any other lookup failure also flags a shutdown. */
		notify->want_shutdown = true;
		return 0;
	}

	mapping = VFS_I(ip)->i_mapping;
	pgoff = xfs_failure_pgoff(mp, rec, notify);
	pgcnt = xfs_failure_pgcnt(mp, rec, notify);

	/* Continue the rmap query if the inode isn't a dax file. */
	if (dax_mapping(mapping))
		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
					  notify->mf_flags);

	/* Invalidate the cache in dax pages. */
	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
		invalidate_inode_pages2_range(mapping, pgoff,
					      pgoff + pgcnt - 1);

	xfs_irele(ip);
	return error;
}
  103. static int
  104. xfs_dax_notify_failure_freeze(
  105. struct xfs_mount *mp)
  106. {
  107. struct super_block *sb = mp->m_super;
  108. int error;
  109. error = freeze_super(sb, FREEZE_HOLDER_KERNEL);
  110. if (error)
  111. xfs_emerg(mp, "already frozen by kernel, err=%d", error);
  112. return error;
  113. }
  114. static void
  115. xfs_dax_notify_failure_thaw(
  116. struct xfs_mount *mp,
  117. bool kernel_frozen)
  118. {
  119. struct super_block *sb = mp->m_super;
  120. int error;
  121. if (kernel_frozen) {
  122. error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
  123. if (error)
  124. xfs_emerg(mp, "still frozen after notify failure, err=%d",
  125. error);
  126. }
  127. /*
  128. * Also thaw userspace call anyway because the device is about to be
  129. * removed immediately.
  130. */
  131. thaw_super(sb, FREEZE_HOLDER_USERSPACE);
  132. }
  133. static int
  134. xfs_dax_translate_range(
  135. struct xfs_buftarg *btp,
  136. u64 offset,
  137. u64 len,
  138. xfs_daddr_t *daddr,
  139. uint64_t *bblen)
  140. {
  141. u64 dev_start = btp->bt_dax_part_off;
  142. u64 dev_len = bdev_nr_bytes(btp->bt_bdev);
  143. u64 dev_end = dev_start + dev_len - 1;
  144. /* Notify failure on the whole device. */
  145. if (offset == 0 && len == U64_MAX) {
  146. offset = dev_start;
  147. len = dev_len;
  148. }
  149. /* Ignore the range out of filesystem area */
  150. if (offset + len - 1 < dev_start)
  151. return -ENXIO;
  152. if (offset > dev_end)
  153. return -ENXIO;
  154. /* Calculate the real range when it touches the boundary */
  155. if (offset > dev_start)
  156. offset -= dev_start;
  157. else {
  158. len -= dev_start - offset;
  159. offset = 0;
  160. }
  161. if (offset + len - 1 > dev_end)
  162. len = dev_end - offset + 1;
  163. *daddr = BTOBB(offset);
  164. *bblen = BTOBB(len);
  165. return 0;
  166. }
  167. static int
  168. xfs_dax_notify_logdev_failure(
  169. struct xfs_mount *mp,
  170. u64 offset,
  171. u64 len,
  172. int mf_flags)
  173. {
  174. xfs_daddr_t daddr;
  175. uint64_t bblen;
  176. int error;
  177. /*
  178. * Return ENXIO instead of shutting down the filesystem if the failed
  179. * region is beyond the end of the log.
  180. */
  181. error = xfs_dax_translate_range(mp->m_logdev_targp,
  182. offset, len, &daddr, &bblen);
  183. if (error)
  184. return error;
  185. /*
  186. * In the pre-remove case the failure notification is attempting to
  187. * trigger a force unmount. The expectation is that the device is
  188. * still present, but its removal is in progress and can not be
  189. * cancelled, proceed with accessing the log device.
  190. */
  191. if (mf_flags & MF_MEM_PRE_REMOVE)
  192. return 0;
  193. xfs_err(mp, "ondisk log corrupt, shutting down fs!");
  194. xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
  195. return -EFSCORRUPTED;
  196. }
/*
 * Notify all owners of a failed range on the data device.  Walks the rmap
 * btree of every AG touched by [daddr, daddr + bblen) and invokes
 * xfs_dax_failure_fn() for each overlapping record.  In the
 * MF_MEM_PRE_REMOVE case the fs is frozen for the duration of the walk and
 * force-unmounted afterwards; otherwise errors or damaged metadata trigger
 * a corruption shutdown.
 */
static int
xfs_dax_notify_ddev_failure(
	struct xfs_mount	*mp,
	xfs_daddr_t		daddr,
	xfs_daddr_t		bblen,
	int			mf_flags)
{
	struct xfs_failure_info	notify = { .mf_flags = mf_flags };
	struct xfs_trans	*tp = NULL;
	struct xfs_btree_cur	*cur = NULL;
	struct xfs_buf		*agf_bp = NULL;
	int			error = 0;
	bool			kernel_frozen = false;
	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, daddr);
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(mp, fsbno);
	xfs_fsblock_t		end_fsbno = XFS_DADDR_TO_FSB(mp,
							    daddr + bblen - 1);
	xfs_agnumber_t		end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);

	if (mf_flags & MF_MEM_PRE_REMOVE) {
		xfs_info(mp, "Device is about to be removed!");
		/*
		 * Freeze fs to prevent new mappings from being created.
		 * - Keep going on if others already hold the kernel frozen.
		 * - Keep going on if other errors too because this device is
		 *   starting to fail.
		 * - If kernel frozen state is held successfully here, thaw it
		 *   here as well at the end.
		 */
		kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
	}

	/* Empty transaction: only used to hold the AGF buffers locked. */
	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		goto out;

	for (; agno <= end_agno; agno++) {
		struct xfs_rmap_irec	ri_low = { };
		struct xfs_rmap_irec	ri_high;
		struct xfs_agf		*agf;
		struct xfs_perag	*pag;
		xfs_agblock_t		range_agend;

		pag = xfs_perag_get(mp, agno);
		error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
		if (error) {
			xfs_perag_put(pag);
			break;
		}

		cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);

		/*
		 * Set the rmap range from ri_low to ri_high, which represents
		 * a [start, end] where we looking for the files or metadata.
		 * ri_high defaults to "all ones" (end of AG) except in the
		 * last AG, where it is clipped to the end of the failed range.
		 */
		memset(&ri_high, 0xFF, sizeof(ri_high));
		ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
		if (agno == end_agno)
			ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno);

		/* Clamp the notify range to the actual AG size. */
		agf = agf_bp->b_addr;
		range_agend = min(be32_to_cpu(agf->agf_length) - 1,
				ri_high.rm_startblock);
		notify.startblock = ri_low.rm_startblock;
		notify.blockcount = range_agend + 1 - ri_low.rm_startblock;

		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
				xfs_dax_failure_fn, &notify);
		/* Release per-AG resources before moving to the next AG. */
		xfs_btree_del_cursor(cur, error);
		xfs_trans_brelse(tp, agf_bp);
		xfs_perag_put(pag);
		if (error)
			break;

		/* Every AG after the first is scanned from its first block. */
		fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0);
	}

	xfs_trans_cancel(tp);

	/*
	 * Shutdown fs from a force umount in pre-remove case which won't fail,
	 * so errors can be ignored.  Otherwise, shutdown the filesystem with
	 * CORRUPT flag if error occurred or notify.want_shutdown was set during
	 * RMAP querying.
	 */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
	else if (error || notify.want_shutdown) {
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
		if (!error)
			error = -EFSCORRUPTED;
	}

out:
	/* Thaw the fs if it has been frozen before. */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_dax_notify_failure_thaw(mp, kernel_frozen);

	return error;
}
  285. static int
  286. xfs_dax_notify_failure(
  287. struct dax_device *dax_dev,
  288. u64 offset,
  289. u64 len,
  290. int mf_flags)
  291. {
  292. struct xfs_mount *mp = dax_holder(dax_dev);
  293. xfs_daddr_t daddr;
  294. uint64_t bblen;
  295. int error;
  296. if (!(mp->m_super->s_flags & SB_BORN)) {
  297. xfs_warn(mp, "filesystem is not ready for notify_failure()!");
  298. return -EIO;
  299. }
  300. if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) {
  301. xfs_debug(mp,
  302. "notify_failure() not supported on realtime device!");
  303. return -EOPNOTSUPP;
  304. }
  305. if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
  306. mp->m_logdev_targp != mp->m_ddev_targp) {
  307. return xfs_dax_notify_logdev_failure(mp, offset, len, mf_flags);
  308. }
  309. if (!xfs_has_rmapbt(mp)) {
  310. xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
  311. return -EOPNOTSUPP;
  312. }
  313. error = xfs_dax_translate_range(mp->m_ddev_targp, offset, len, &daddr,
  314. &bblen);
  315. if (error)
  316. return error;
  317. return xfs_dax_notify_ddev_failure(mp, daddr, bblen, mf_flags);
  318. }
/* Registered with the dax device when XFS mounts on top of it. */
const struct dax_holder_operations xfs_dax_holder_operations = {
	.notify_failure		= xfs_dax_notify_failure,
};