| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955 |
- // SPDX-License-Identifier: GPL-2.0
- /*
- * linux/fs/ext4/file.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- * from
- *
- * linux/fs/minix/file.c
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * ext4 fs regular file handling primitives
- *
- * 64-bit file support on 64-bit platforms by Jakub Jelinek
- * (jj@sunsite.ms.mff.cuni.cz)
- */
- #include <linux/time.h>
- #include <linux/fs.h>
- #include <linux/iomap.h>
- #include <linux/mount.h>
- #include <linux/path.h>
- #include <linux/dax.h>
- #include <linux/quotaops.h>
- #include <linux/pagevec.h>
- #include <linux/uio.h>
- #include <linux/mman.h>
- #include <linux/backing-dev.h>
- #include "ext4.h"
- #include "ext4_jbd2.h"
- #include "xattr.h"
- #include "acl.h"
- #include "truncate.h"
- /*
- * Returns %true if the given DIO request should be attempted with DIO, or
- * %false if it should fall back to buffered I/O.
- *
- * DIO isn't well specified; when it's unsupported (either due to the request
- * being misaligned, or due to the file not supporting DIO at all), filesystems
- * either fall back to buffered I/O or return EINVAL. For files that don't use
- * any special features like encryption or verity, ext4 has traditionally
- * returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too.
- * In this case, we should attempt the DIO, *not* fall back to buffered I/O.
- *
- * In contrast, in cases where DIO is unsupported due to ext4 features, ext4
- * traditionally falls back to buffered I/O.
- *
- * This function implements the traditional ext4 behavior in all these cases.
- */
- static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter)
- {
- struct inode *inode = file_inode(iocb->ki_filp);
- u32 dio_align = ext4_dio_alignment(inode);
- if (dio_align == 0)
- return false;
- if (dio_align == 1)
- return true;
- return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align);
- }
- static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
- {
- ssize_t ret;
- struct inode *inode = file_inode(iocb->ki_filp);
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!inode_trylock_shared(inode))
- return -EAGAIN;
- } else {
- inode_lock_shared(inode);
- }
- if (!ext4_should_use_dio(iocb, to)) {
- inode_unlock_shared(inode);
- /*
- * Fallback to buffered I/O if the operation being performed on
- * the inode is not supported by direct I/O. The IOCB_DIRECT
- * flag needs to be cleared here in order to ensure that the
- * direct I/O path within generic_file_read_iter() is not
- * taken.
- */
- iocb->ki_flags &= ~IOCB_DIRECT;
- return generic_file_read_iter(iocb, to);
- }
- ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0);
- inode_unlock_shared(inode);
- file_accessed(iocb->ki_filp);
- return ret;
- }
- #ifdef CONFIG_FS_DAX
- static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
- {
- struct inode *inode = file_inode(iocb->ki_filp);
- ssize_t ret;
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!inode_trylock_shared(inode))
- return -EAGAIN;
- } else {
- inode_lock_shared(inode);
- }
- /*
- * Recheck under inode lock - at this point we are sure it cannot
- * change anymore
- */
- if (!IS_DAX(inode)) {
- inode_unlock_shared(inode);
- /* Fallback to buffered IO in case we cannot support DAX */
- return generic_file_read_iter(iocb, to);
- }
- ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops);
- inode_unlock_shared(inode);
- file_accessed(iocb->ki_filp);
- return ret;
- }
- #endif
- static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
- {
- struct inode *inode = file_inode(iocb->ki_filp);
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
- if (!iov_iter_count(to))
- return 0; /* skip atime */
- #ifdef CONFIG_FS_DAX
- if (IS_DAX(inode))
- return ext4_dax_read_iter(iocb, to);
- #endif
- if (iocb->ki_flags & IOCB_DIRECT)
- return ext4_dio_read_iter(iocb, to);
- return generic_file_read_iter(iocb, to);
- }
- static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t len, unsigned int flags)
- {
- struct inode *inode = file_inode(in);
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
- return filemap_splice_read(in, ppos, pipe, len, flags);
- }
- /*
- * Called when an inode is released. Note that this is different
- * from ext4_file_open: open gets called at every open, but release
- * gets called only when /all/ the files are closed.
- */
- static int ext4_release_file(struct inode *inode, struct file *filp)
- {
- if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
- ext4_alloc_da_blocks(inode);
- ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
- }
- /* if we are the last writer on the inode, drop the block reservation */
- if ((filp->f_mode & FMODE_WRITE) &&
- (atomic_read(&inode->i_writecount) == 1) &&
- !EXT4_I(inode)->i_reserved_data_blocks) {
- down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
- up_write(&EXT4_I(inode)->i_data_sem);
- }
- if (is_dx(inode) && filp->private_data)
- ext4_htree_free_dir_info(filp->private_data);
- return 0;
- }
- /*
- * This tests whether the IO in question is block-aligned or not.
- * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
- * are converted to written only after the IO is complete. Until they are
- * mapped, these blocks appear as holes, so dio_zero_block() will assume that
- * it needs to zero out portions of the start and/or end block. If 2 AIO
- * threads are at work on the same unwritten block, they must be synchronized
- * or one thread will zero the other's data, causing corruption.
- */
- static bool
- ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos)
- {
- struct super_block *sb = inode->i_sb;
- unsigned long blockmask = sb->s_blocksize - 1;
- if ((pos | iov_iter_alignment(from)) & blockmask)
- return true;
- return false;
- }
- static bool
- ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
- {
- if (offset + len > i_size_read(inode) ||
- offset + len > EXT4_I(inode)->i_disksize)
- return true;
- return false;
- }
- /* Is IO overwriting allocated or initialized blocks? */
- static bool ext4_overwrite_io(struct inode *inode,
- loff_t pos, loff_t len, bool *unwritten)
- {
- struct ext4_map_blocks map;
- unsigned int blkbits = inode->i_blkbits;
- int err, blklen;
- if (pos + len > i_size_read(inode))
- return false;
- map.m_lblk = pos >> blkbits;
- map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
- blklen = map.m_len;
- err = ext4_map_blocks(NULL, inode, &map, 0);
- if (err != blklen)
- return false;
- /*
- * 'err==len' means that all of the blocks have been preallocated,
- * regardless of whether they have been initialized or not. We need to
- * check m_flags to distinguish the unwritten extents.
- */
- *unwritten = !(map.m_flags & EXT4_MAP_MAPPED);
- return true;
- }
- static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
- struct iov_iter *from)
- {
- struct inode *inode = file_inode(iocb->ki_filp);
- ssize_t ret;
- if (unlikely(IS_IMMUTABLE(inode)))
- return -EPERM;
- ret = generic_write_checks(iocb, from);
- if (ret <= 0)
- return ret;
- /*
- * If we have encountered a bitmap-format file, the size limit
- * is smaller than s_maxbytes, which is for extent-mapped files.
- */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (iocb->ki_pos >= sbi->s_bitmap_maxbytes)
- return -EFBIG;
- iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
- }
- return iov_iter_count(from);
- }
- static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
- {
- ssize_t ret, count;
- count = ext4_generic_write_checks(iocb, from);
- if (count <= 0)
- return count;
- ret = file_modified(iocb->ki_filp);
- if (ret)
- return ret;
- return count;
- }
- static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
- struct iov_iter *from)
- {
- ssize_t ret;
- struct inode *inode = file_inode(iocb->ki_filp);
- if (iocb->ki_flags & IOCB_NOWAIT)
- return -EOPNOTSUPP;
- inode_lock(inode);
- ret = ext4_write_checks(iocb, from);
- if (ret <= 0)
- goto out;
- ret = generic_perform_write(iocb, from);
- out:
- inode_unlock(inode);
- if (unlikely(ret <= 0))
- return ret;
- return generic_write_sync(iocb, ret);
- }
- static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
- ssize_t written, ssize_t count)
- {
- handle_t *handle;
- lockdep_assert_held_write(&inode->i_rwsem);
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- if (ext4_update_inode_size(inode, offset + written)) {
- int ret = ext4_mark_inode_dirty(handle, inode);
- if (unlikely(ret)) {
- ext4_journal_stop(handle);
- return ret;
- }
- }
- if ((written == count) && inode->i_nlink)
- ext4_orphan_del(handle, inode);
- ext4_journal_stop(handle);
- return written;
- }
- /*
- * Clean up the inode after DIO or DAX extending write has completed and the
- * inode size has been updated using ext4_handle_inode_extension().
- */
- static void ext4_inode_extension_cleanup(struct inode *inode, bool need_trunc)
- {
- lockdep_assert_held_write(&inode->i_rwsem);
- if (need_trunc) {
- ext4_truncate_failed_write(inode);
- /*
- * If the truncate operation failed early, then the inode may
- * still be on the orphan list. In that case, we need to try
- * remove the inode from the in-memory linked list.
- */
- if (inode->i_nlink)
- ext4_orphan_del(NULL, inode);
- return;
- }
- /*
- * If i_disksize got extended either due to writeback of delalloc
- * blocks or extending truncate while the DIO was running we could fail
- * to cleanup the orphan list in ext4_handle_inode_extension(). Do it
- * now.
- */
- if (ext4_inode_orphan_tracked(inode) && inode->i_nlink) {
- handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- /*
- * The write has successfully completed. Not much to
- * do with the error here so just cleanup the orphan
- * list and hope for the best.
- */
- ext4_orphan_del(NULL, inode);
- return;
- }
- ext4_orphan_del(handle, inode);
- ext4_journal_stop(handle);
- }
- }
- static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
- int error, unsigned int flags)
- {
- loff_t pos = iocb->ki_pos;
- struct inode *inode = file_inode(iocb->ki_filp);
- if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
- error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
- if (error)
- return error;
- /*
- * Note that EXT4_I(inode)->i_disksize can get extended up to
- * inode->i_size while the I/O was running due to writeback of delalloc
- * blocks. But the code in ext4_iomap_alloc() is careful to use
- * zeroed/unwritten extents if this is possible; thus we won't leave
- * uninitialized blocks in a file even if we didn't succeed in writing
- * as much as we intended. Also we can race with truncate or write
- * expanding the file so we have to be a bit careful here.
- */
- if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) &&
- pos + size <= i_size_read(inode))
- return size;
- return ext4_handle_inode_extension(inode, pos, size, size);
- }
- static const struct iomap_dio_ops ext4_dio_write_ops = { /* callbacks passed to iomap_dio_rw() by ext4_dio_write_iter() */
- .end_io = ext4_dio_write_end_io, /* converts unwritten extents and extends the inode size on completion */
- };
- /*
- * Run the write checks for a direct I/O write and settle whether it can
- * proceed under the shared i_rwsem or needs the exclusive one.
- *
- * The intention here is to start with the shared lock acquired and then see
- * if any condition requires an exclusive inode lock. If yes, then we restart
- * the whole operation by releasing the shared lock and acquiring the
- * exclusive lock.
- *
- * - Unaligned I/O may stay under the shared lock only when it is a pure
- * overwrite of written blocks; otherwise two unaligned I/Os trying to modify
- * the same block (e.g. while zeroing) may cause data corruption.
- *
- * - For the extending write case we don't keep the shared lock, since it
- * requires updating inode i_disksize and/or orphan handling with the
- * exclusive lock.
- *
- * - Shared locking will mostly only be used for overwrites, of both
- * initialized and unwritten blocks. When overwriting unwritten blocks the
- * splitting of extents is protected by i_data_sem in ext4_inode_info, so the
- * exclusive i_rwsem lock is not needed for it.
- *
- * - Otherwise we will switch to the exclusive i_rwsem lock.
- *
- * The caller must hold i_rwsem, shared if *ilock_shared is true and exclusive
- * otherwise. *ilock_shared is cleared when the lock gets upgraded, *extend
- * reports whether the write is extending, *unwritten is only written by
- * ext4_overwrite_io() when the whole range is mapped, and *dio_flags is set
- * to IOMAP_DIO_FORCE_WAIT for exclusive unaligned or extending writes.
- *
- * Returns the number of bytes to write with i_rwsem still held, or a value
- * <= 0 with i_rwsem already released.
- */
- static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
- bool *ilock_shared, bool *extend,
- bool *unwritten, int *dio_flags)
- {
- struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file);
- loff_t offset;
- size_t count;
- ssize_t ret;
- bool overwrite, unaligned_io;
- restart: /* re-entered once, after the shared lock has been swapped for the exclusive one */
- ret = ext4_generic_write_checks(iocb, from);
- if (ret <= 0)
- goto out;
- offset = iocb->ki_pos;
- count = ret;
- unaligned_io = ext4_unaligned_io(inode, from, offset);
- *extend = ext4_extending_io(inode, offset, count);
- overwrite = ext4_overwrite_io(inode, offset, count, unwritten);
- /*
- * Determine whether we need to upgrade to an exclusive lock. This is
- * required to change security info in file_modified(), for extending
- * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten
- * extents (as partial block zeroing may be required).
- *
- * Note that unaligned writes are allowed under shared lock so long as
- * they are pure overwrites. Otherwise, concurrent unaligned writes risk
- * data corruption due to partial block zeroing in the dio layer, and so
- * the I/O must occur exclusively.
- */
- if (*ilock_shared &&
- ((!IS_NOSEC(inode) || *extend || !overwrite ||
- (unaligned_io && *unwritten)))) {
- if (iocb->ki_flags & IOCB_NOWAIT) { /* upgrading would block, so give up instead */
- ret = -EAGAIN;
- goto out;
- }
- inode_unlock_shared(inode);
- *ilock_shared = false;
- inode_lock(inode);
- goto restart; /* the lock was dropped in between, so every check is redone */
- }
- /*
- * Now that locking is settled, determine dio flags and exclusivity
- * requirements. We don't use DIO_OVERWRITE_ONLY because we enforce
- * behavior already. The inode lock is already held exclusive if the
- * write is non-overwrite or extending, so drain all outstanding dio and
- * set the force wait dio flag.
- */
- if (!*ilock_shared && (unaligned_io || *extend)) {
- if (iocb->ki_flags & IOCB_NOWAIT) {
- ret = -EAGAIN;
- goto out;
- }
- if (unaligned_io && (!overwrite || *unwritten))
- inode_dio_wait(inode);
- *dio_flags = IOMAP_DIO_FORCE_WAIT;
- }
- ret = file_modified(file);
- if (ret < 0)
- goto out;
- return count; /* success: i_rwsem stays held for the caller */
- out: /* failure or nothing to write: drop whichever flavour of i_rwsem is held */
- if (*ilock_shared)
- inode_unlock_shared(inode);
- else
- inode_unlock(inode);
- return ret;
- }
- static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
- {
- ssize_t ret;
- handle_t *handle;
- struct inode *inode = file_inode(iocb->ki_filp);
- loff_t offset = iocb->ki_pos;
- size_t count = iov_iter_count(from);
- const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
- bool extend = false, unwritten = false;
- bool ilock_shared = true;
- int dio_flags = 0;
- /*
- * Quick check here without any i_rwsem lock to see if it is extending
- * IO. A more reliable check is done in ext4_dio_write_checks() with
- * proper locking in place.
- */
- if (offset + count > i_size_read(inode))
- ilock_shared = false;
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (ilock_shared) {
- if (!inode_trylock_shared(inode))
- return -EAGAIN;
- } else {
- if (!inode_trylock(inode))
- return -EAGAIN;
- }
- } else {
- if (ilock_shared)
- inode_lock_shared(inode);
- else
- inode_lock(inode);
- }
- /* Fallback to buffered I/O if the inode does not support direct I/O. */
- if (!ext4_should_use_dio(iocb, from)) {
- if (ilock_shared)
- inode_unlock_shared(inode);
- else
- inode_unlock(inode);
- return ext4_buffered_write_iter(iocb, from);
- }
- /*
- * Prevent inline data from being created since we are going to allocate
- * blocks for DIO. We know the inode does not currently have inline data
- * because ext4_should_use_dio() checked for it, but we have to clear
- * the state flag before the write checks because a lock cycle could
- * introduce races with other writers.
- */
- ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
- ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend,
- &unwritten, &dio_flags);
- if (ret <= 0)
- return ret;
- offset = iocb->ki_pos;
- count = ret;
- if (extend) {
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
- ret = ext4_orphan_add(handle, inode);
- if (ret) {
- ext4_journal_stop(handle);
- goto out;
- }
- ext4_journal_stop(handle);
- }
- if (ilock_shared && !unwritten)
- iomap_ops = &ext4_iomap_overwrite_ops;
- ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
- dio_flags, NULL, 0);
- if (ret == -ENOTBLK)
- ret = 0;
- if (extend) {
- /*
- * We always perform extending DIO write synchronously so by
- * now the IO is completed and ext4_handle_inode_extension()
- * was called. Cleanup the inode in case of error or race with
- * writeback of delalloc blocks.
- */
- WARN_ON_ONCE(ret == -EIOCBQUEUED);
- ext4_inode_extension_cleanup(inode, ret < 0);
- }
- out:
- if (ilock_shared)
- inode_unlock_shared(inode);
- else
- inode_unlock(inode);
- if (ret >= 0 && iov_iter_count(from)) {
- ssize_t err;
- loff_t endbyte;
- offset = iocb->ki_pos;
- err = ext4_buffered_write_iter(iocb, from);
- if (err < 0)
- return err;
- /*
- * We need to ensure that the pages within the page cache for
- * the range covered by this I/O are written to disk and
- * invalidated. This is in attempt to preserve the expected
- * direct I/O semantics in the case we fallback to buffered I/O
- * to complete off the I/O request.
- */
- ret += err;
- endbyte = offset + err - 1;
- err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
- offset, endbyte);
- if (!err)
- invalidate_mapping_pages(iocb->ki_filp->f_mapping,
- offset >> PAGE_SHIFT,
- endbyte >> PAGE_SHIFT);
- }
- return ret;
- }
- #ifdef CONFIG_FS_DAX
- static ssize_t
- ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
- {
- ssize_t ret;
- size_t count;
- loff_t offset;
- handle_t *handle;
- bool extend = false;
- struct inode *inode = file_inode(iocb->ki_filp);
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!inode_trylock(inode))
- return -EAGAIN;
- } else {
- inode_lock(inode);
- }
- ret = ext4_write_checks(iocb, from);
- if (ret <= 0)
- goto out;
- offset = iocb->ki_pos;
- count = iov_iter_count(from);
- if (offset + count > EXT4_I(inode)->i_disksize) {
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
- ret = ext4_orphan_add(handle, inode);
- if (ret) {
- ext4_journal_stop(handle);
- goto out;
- }
- extend = true;
- ext4_journal_stop(handle);
- }
- ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
- if (extend) {
- ret = ext4_handle_inode_extension(inode, offset, ret, count);
- ext4_inode_extension_cleanup(inode, ret < (ssize_t)count);
- }
- out:
- inode_unlock(inode);
- if (ret > 0)
- ret = generic_write_sync(iocb, ret);
- return ret;
- }
- #endif
- static ssize_t
- ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
- {
- struct inode *inode = file_inode(iocb->ki_filp);
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
- #ifdef CONFIG_FS_DAX
- if (IS_DAX(inode))
- return ext4_dax_write_iter(iocb, from);
- #endif
- if (iocb->ki_flags & IOCB_DIRECT)
- return ext4_dio_write_iter(iocb, from);
- else
- return ext4_buffered_write_iter(iocb, from);
- }
- #ifdef CONFIG_FS_DAX
- static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order)
- {
- int error = 0;
- vm_fault_t result;
- int retries = 0;
- handle_t *handle = NULL;
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct super_block *sb = inode->i_sb;
- /*
- * We have to distinguish real writes from writes which will result in a
- * COW page; COW writes should *not* poke the journal (the file will not
- * be changed). Doing so would cause unintended failures when mounted
- * read-only.
- *
- * We check for VM_SHARED rather than vmf->cow_page since the latter is
- * unset for order != 0 (i.e. only in do_cow_fault); for
- * other sizes, dax_iomap_fault will handle splitting / fallback so that
- * we eventually come back with a COW page.
- */
- bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
- (vmf->vma->vm_flags & VM_SHARED);
- struct address_space *mapping = vmf->vma->vm_file->f_mapping;
- pfn_t pfn;
- if (write) {
- sb_start_pagefault(sb);
- file_update_time(vmf->vma->vm_file);
- filemap_invalidate_lock_shared(mapping);
- retry:
- handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
- EXT4_DATA_TRANS_BLOCKS(sb));
- if (IS_ERR(handle)) {
- filemap_invalidate_unlock_shared(mapping);
- sb_end_pagefault(sb);
- return VM_FAULT_SIGBUS;
- }
- } else {
- filemap_invalidate_lock_shared(mapping);
- }
- result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops);
- if (write) {
- ext4_journal_stop(handle);
- if ((result & VM_FAULT_ERROR) && error == -ENOSPC &&
- ext4_should_retry_alloc(sb, &retries))
- goto retry;
- /* Handling synchronous page fault? */
- if (result & VM_FAULT_NEEDDSYNC)
- result = dax_finish_sync_fault(vmf, order, pfn);
- filemap_invalidate_unlock_shared(mapping);
- sb_end_pagefault(sb);
- } else {
- filemap_invalidate_unlock_shared(mapping);
- }
- return result;
- }
- static vm_fault_t ext4_dax_fault(struct vm_fault *vmf)
- {
- return ext4_dax_huge_fault(vmf, 0);
- }
- static const struct vm_operations_struct ext4_dax_vm_ops = { /* installed by ext4_file_mmap() when the inode is DAX */
- .fault = ext4_dax_fault, /* order-0 wrapper around ext4_dax_huge_fault() */
- .huge_fault = ext4_dax_huge_fault,
- .page_mkwrite = ext4_dax_fault, /* same handler as .fault */
- .pfn_mkwrite = ext4_dax_fault, /* same handler as .fault */
- };
- #else
- #define ext4_dax_vm_ops ext4_file_vm_ops
- #endif
- static const struct vm_operations_struct ext4_file_vm_ops = { /* installed by ext4_file_mmap() for non-DAX inodes; ext4_dax_vm_ops aliases it when CONFIG_FS_DAX is off */
- .fault = filemap_fault,
- .map_pages = filemap_map_pages,
- .page_mkwrite = ext4_page_mkwrite, /* defined outside this file */
- };
- static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
- {
- struct inode *inode = file->f_mapping->host;
- struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
- /*
- * We don't support synchronous mappings for non-DAX files and
- * for DAX files if underneath dax_device is not synchronous.
- */
- if (!daxdev_mapping_supported(vma, dax_dev))
- return -EOPNOTSUPP;
- file_accessed(file);
- if (IS_DAX(file_inode(file))) {
- vma->vm_ops = &ext4_dax_vm_ops;
- vm_flags_set(vma, VM_HUGEPAGE);
- } else {
- vma->vm_ops = &ext4_file_vm_ops;
- }
- return 0;
- }
- static int ext4_sample_last_mounted(struct super_block *sb,
- struct vfsmount *mnt)
- {
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct path path;
- char buf[64], *cp;
- handle_t *handle;
- int err;
- if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
- return 0;
- if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
- return 0;
- ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
- /*
- * Sample where the filesystem has been mounted and
- * store it in the superblock for sysadmin convenience
- * when trying to sort through large numbers of block
- * devices or filesystem images.
- */
- memset(buf, 0, sizeof(buf));
- path.mnt = mnt;
- path.dentry = mnt->mnt_root;
- cp = d_path(&path, buf, sizeof(buf));
- err = 0;
- if (IS_ERR(cp))
- goto out;
- handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
- err = PTR_ERR(handle);
- if (IS_ERR(handle))
- goto out;
- BUFFER_TRACE(sbi->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
- EXT4_JTR_NONE);
- if (err)
- goto out_journal;
- lock_buffer(sbi->s_sbh);
- strtomem_pad(sbi->s_es->s_last_mounted, cp, 0);
- ext4_superblock_csum_set(sb);
- unlock_buffer(sbi->s_sbh);
- ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
- out_journal:
- ext4_journal_stop(handle);
- out:
- sb_end_intwrite(sb);
- return err;
- }
- static int ext4_file_open(struct inode *inode, struct file *filp)
- {
- int ret;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
- ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
- if (ret)
- return ret;
- ret = fscrypt_file_open(inode, filp);
- if (ret)
- return ret;
- ret = fsverity_file_open(inode, filp);
- if (ret)
- return ret;
- /*
- * Set up the jbd2_inode if we are opening the inode for
- * writing and the journal is present
- */
- if (filp->f_mode & FMODE_WRITE) {
- ret = ext4_inode_attach_jinode(inode);
- if (ret < 0)
- return ret;
- }
- filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
- return dquot_file_open(inode, filp);
- }
- /*
- * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
- * by calling generic_file_llseek_size() with the appropriate maxbytes
- * value for each.
- */
- loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
- {
- struct inode *inode = file->f_mapping->host;
- loff_t maxbytes = ext4_get_maxbytes(inode);
- switch (whence) {
- default:
- return generic_file_llseek_size(file, offset, whence,
- maxbytes, i_size_read(inode));
- case SEEK_HOLE:
- inode_lock_shared(inode);
- offset = iomap_seek_hole(inode, offset,
- &ext4_iomap_report_ops);
- inode_unlock_shared(inode);
- break;
- case SEEK_DATA:
- inode_lock_shared(inode);
- offset = iomap_seek_data(inode, offset,
- &ext4_iomap_report_ops);
- inode_unlock_shared(inode);
- break;
- }
- if (offset < 0)
- return offset;
- return vfs_setpos(file, offset, maxbytes);
- }
- const struct file_operations ext4_file_operations = { /* non-static; handlers without a note below are defined outside this file */
- .llseek = ext4_llseek, /* SEEK_HOLE/SEEK_DATA via iomap, other whence values via generic_file_llseek_size() */
- .read_iter = ext4_file_read_iter, /* dispatches to the DAX, direct or buffered read path */
- .write_iter = ext4_file_write_iter, /* dispatches to the DAX, direct or buffered write path */
- .iopoll = iocb_bio_iopoll,
- .unlocked_ioctl = ext4_ioctl,
- #ifdef CONFIG_COMPAT
- .compat_ioctl = ext4_compat_ioctl,
- #endif
- .mmap = ext4_file_mmap, /* installs ext4_dax_vm_ops or ext4_file_vm_ops */
- .open = ext4_file_open,
- .release = ext4_release_file,
- .fsync = ext4_sync_file,
- .get_unmapped_area = thp_get_unmapped_area,
- .splice_read = ext4_file_splice_read, /* filemap_splice_read() behind a forced-shutdown check */
- .splice_write = iter_file_splice_write,
- .fallocate = ext4_fallocate,
- .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
- FOP_DIO_PARALLEL_WRITE,
- };
- const struct inode_operations ext4_file_inode_operations = { /* non-static; every handler below is defined outside this file */
- .setattr = ext4_setattr,
- .getattr = ext4_file_getattr,
- .listxattr = ext4_listxattr,
- .get_inode_acl = ext4_get_acl,
- .set_acl = ext4_set_acl,
- .fiemap = ext4_fiemap,
- .fileattr_get = ext4_fileattr_get,
- .fileattr_set = ext4_fileattr_set,
- }; /* NOTE(review): presumably assigned to regular-file inodes - confirm at the i_op assignment site */
|