From: Simon Glass <simon.glass@canonical.com> Copy file.c, fsmap.c, fsmap.h, and ialloc.c from Linux v6.18 fs/ext4 directory. - file: file operations (open, read, write, mmap, etc.) - fsmap: filesystem block mapping for GETFSMAP ioctl - ialloc: inode allocation Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> --- fs/ext4l/file.c | 995 ++++++++++++++++++++++++++++ fs/ext4l/fsmap.c | 792 ++++++++++++++++++++++ fs/ext4l/fsmap.h | 56 ++ fs/ext4l/ialloc.c | 1621 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 3464 insertions(+) create mode 100644 fs/ext4l/file.c create mode 100644 fs/ext4l/fsmap.c create mode 100644 fs/ext4l/fsmap.h create mode 100644 fs/ext4l/ialloc.c diff --git a/fs/ext4l/file.c b/fs/ext4l/file.c new file mode 100644 index 00000000000..7a8b3093218 --- /dev/null +++ b/fs/ext4l/file.c @@ -0,0 +1,995 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4 fs regular file handling primitives + * + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/iomap.h> +#include <linux/mount.h> +#include <linux/path.h> +#include <linux/dax.h> +#include <linux/quotaops.h> +#include <linux/pagevec.h> +#include <linux/uio.h> +#include <linux/mman.h> +#include <linux/backing-dev.h> +#include "ext4.h" +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" +#include "truncate.h" + +/* + * Returns %true if the given DIO request should be attempted with DIO, or + * %false if it should fall back to buffered I/O. + * + * DIO isn't well specified; when it's unsupported (either due to the request + * being misaligned, or due to the file not supporting DIO at all), filesystems + * either fall back to buffered I/O or return EINVAL. For files that don't use + * any special features like encryption or verity, ext4 has traditionally + * returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too. + * In this case, we should attempt the DIO, *not* fall back to buffered I/O. + * + * In contrast, in cases where DIO is unsupported due to ext4 features, ext4 + * traditionally falls back to buffered I/O. + * + * This function implements the traditional ext4 behavior in all these cases. + */ +static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + u32 dio_align = ext4_dio_alignment(inode); + + if (dio_align == 0) + return false; + + if (dio_align == 1) + return true; + + return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align); +} + +static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + ssize_t ret; + struct inode *inode = file_inode(iocb->ki_filp); + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + inode_lock_shared(inode); + } + + if (!ext4_should_use_dio(iocb, to)) { + inode_unlock_shared(inode); + /* + * Fallback to buffered I/O if the operation being performed on + * the inode is not supported by direct I/O. The IOCB_DIRECT + * flag needs to be cleared here in order to ensure that the + * direct I/O path within generic_file_read_iter() is not + * taken. + */ + iocb->ki_flags &= ~IOCB_DIRECT; + return generic_file_read_iter(iocb, to); + } + + ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0); + inode_unlock_shared(inode); + + file_accessed(iocb->ki_filp); + return ret; +} + +#ifdef CONFIG_FS_DAX +static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + inode_lock_shared(inode); + } + /* + * Recheck under inode lock - at this point we are sure it cannot + * change anymore + */ + if (!IS_DAX(inode)) { + inode_unlock_shared(inode); + /* Fallback to buffered IO in case we cannot support DAX */ + return generic_file_read_iter(iocb, to); + } + ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops); + inode_unlock_shared(inode); + + file_accessed(iocb->ki_filp); + return ret; +} +#endif + +static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + if (unlikely(ext4_forced_shutdown(inode->i_sb))) + return -EIO; + + if (!iov_iter_count(to)) + return 0; /* skip atime */ + +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + return ext4_dax_read_iter(iocb, to); +#endif + if (iocb->ki_flags & IOCB_DIRECT) + return ext4_dio_read_iter(iocb, to); + + return generic_file_read_iter(iocb, to); +} + +static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct inode *inode = file_inode(in); + + if (unlikely(ext4_forced_shutdown(inode->i_sb))) + return -EIO; + return filemap_splice_read(in, ppos, pipe, len, flags); +} + +/* + * Called when an inode is released. Note that this is different + * from ext4_file_open: open gets called at every open, but release + * gets called only when /all/ the files are closed. + */ +static int ext4_release_file(struct inode *inode, struct file *filp) +{ + if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { + ext4_alloc_da_blocks(inode); + ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + } + /* if we are the last writer on the inode, drop the block reservation */ + if ((filp->f_mode & FMODE_WRITE) && + (atomic_read(&inode->i_writecount) == 1) && + !EXT4_I(inode)->i_reserved_data_blocks) { + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + } + if (is_dx(inode) && filp->private_data) + ext4_htree_free_dir_info(filp->private_data); + + return 0; +} + +/* + * This tests whether the IO in question is block-aligned or not. + * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they + * are converted to written only after the IO is complete. Until they are + * mapped, these blocks appear as holes, so dio_zero_block() will assume that + * it needs to zero out portions of the start and/or end block. If 2 AIO + * threads are at work on the same unwritten block, they must be synchronized + * or one thread will zero the other's data, causing corruption. + */ +static bool +ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos) +{ + struct super_block *sb = inode->i_sb; + unsigned long blockmask = sb->s_blocksize - 1; + + if ((pos | iov_iter_alignment(from)) & blockmask) + return true; + + return false; +} + +static bool +ext4_extending_io(struct inode *inode, loff_t offset, size_t len) +{ + if (offset + len > i_size_read(inode) || + offset + len > EXT4_I(inode)->i_disksize) + return true; + return false; +} + +/* Is IO overwriting allocated or initialized blocks? */ +static bool ext4_overwrite_io(struct inode *inode, + loff_t pos, loff_t len, bool *unwritten) +{ + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + int err, blklen; + + if (pos + len > i_size_read(inode)) + return false; + + map.m_lblk = pos >> blkbits; + map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits); + blklen = map.m_len; + + err = ext4_map_blocks(NULL, inode, &map, 0); + if (err != blklen) + return false; + /* + * 'err==len' means that all of the blocks have been preallocated, + * regardless of whether they have been initialized or not. We need to + * check m_flags to distinguish the unwritten extents. + */ + *unwritten = !(map.m_flags & EXT4_MAP_MAPPED); + return true; +} + +static ssize_t ext4_generic_write_checks(struct kiocb *iocb, + struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + return ret; + + /* + * If we have encountered a bitmap-format file, the size limit + * is smaller than s_maxbytes, which is for extent-mapped files. + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) + return -EFBIG; + iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); + } + + return iov_iter_count(from); +} + +static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + ssize_t ret, count; + + count = ext4_generic_write_checks(iocb, from); + if (count <= 0) + return count; + + ret = file_modified(iocb->ki_filp); + if (ret) + return ret; + return count; +} + +static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, + struct iov_iter *from) +{ + ssize_t ret; + struct inode *inode = file_inode(iocb->ki_filp); + + if (iocb->ki_flags & IOCB_NOWAIT) + return -EOPNOTSUPP; + + inode_lock(inode); + ret = ext4_write_checks(iocb, from); + if (ret <= 0) + goto out; + + ret = generic_perform_write(iocb, from); + +out: + inode_unlock(inode); + if (unlikely(ret <= 0)) + return ret; + return generic_write_sync(iocb, ret); +} + +static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, + ssize_t written, ssize_t count) +{ + handle_t *handle; + + lockdep_assert_held_write(&inode->i_rwsem); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (ext4_update_inode_size(inode, offset + written)) { + int ret = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret)) { + ext4_journal_stop(handle); + return ret; + } + } + + if ((written == count) && inode->i_nlink) + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + + return written; +} + +/* + * Clean up the inode after DIO or DAX extending write has completed and the + * inode size has been updated using ext4_handle_inode_extension(). + */ +static void ext4_inode_extension_cleanup(struct inode *inode, bool need_trunc) +{ + lockdep_assert_held_write(&inode->i_rwsem); + if (need_trunc) { + ext4_truncate_failed_write(inode); + /* + * If the truncate operation failed early, then the inode may + * still be on the orphan list. In that case, we need to try + * remove the inode from the in-memory linked list. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + return; + } + /* + * If i_disksize got extended either due to writeback of delalloc + * blocks or extending truncate while the DIO was running we could fail + * to cleanup the orphan list in ext4_handle_inode_extension(). Do it + * now. + */ + if (ext4_inode_orphan_tracked(inode) && inode->i_nlink) { + handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + + if (IS_ERR(handle)) { + /* + * The write has successfully completed. Not much to + * do with the error here so just cleanup the orphan + * list and hope for the best. + */ + ext4_orphan_del(NULL, inode); + return; + } + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + } +} + +static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned int flags) +{ + loff_t pos = iocb->ki_pos; + struct inode *inode = file_inode(iocb->ki_filp); + + + if (!error && size && (flags & IOMAP_DIO_UNWRITTEN) && + (iocb->ki_flags & IOCB_ATOMIC)) + error = ext4_convert_unwritten_extents_atomic(NULL, inode, pos, + size); + else if (!error && size && flags & IOMAP_DIO_UNWRITTEN) + error = ext4_convert_unwritten_extents(NULL, inode, pos, size); + if (error) + return error; + /* + * Note that EXT4_I(inode)->i_disksize can get extended up to + * inode->i_size while the I/O was running due to writeback of delalloc + * blocks. But the code in ext4_iomap_alloc() is careful to use + * zeroed/unwritten extents if this is possible; thus we won't leave + * uninitialized blocks in a file even if we didn't succeed in writing + * as much as we intended. Also we can race with truncate or write + * expanding the file so we have to be a bit careful here. + */ + if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) && + pos + size <= i_size_read(inode)) + return 0; + error = ext4_handle_inode_extension(inode, pos, size, size); + return error < 0 ? error : 0; +} + +static const struct iomap_dio_ops ext4_dio_write_ops = { + .end_io = ext4_dio_write_end_io, +}; + +/* + * The intention here is to start with shared lock acquired then see if any + * condition requires an exclusive inode lock. If yes, then we restart the + * whole operation by releasing the shared lock and acquiring exclusive lock. + * + * - For unaligned_io we never take shared lock as it may cause data corruption + * when two unaligned IO tries to modify the same block e.g. while zeroing. + * + * - For extending writes case we don't take the shared lock, since it requires + * updating inode i_disksize and/or orphan handling with exclusive lock. + * + * - shared locking will only be true mostly with overwrites, including + * initialized blocks and unwritten blocks. For overwrite unwritten blocks + * we protect splitting extents by i_data_sem in ext4_inode_info, so we can + * also release exclusive i_rwsem lock. + * + * - Otherwise we will switch to exclusive i_rwsem lock. + */ +static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, + bool *ilock_shared, bool *extend, + bool *unwritten, int *dio_flags) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + loff_t offset; + size_t count; + ssize_t ret; + bool overwrite, unaligned_io; + +restart: + ret = ext4_generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + + offset = iocb->ki_pos; + count = ret; + + unaligned_io = ext4_unaligned_io(inode, from, offset); + *extend = ext4_extending_io(inode, offset, count); + overwrite = ext4_overwrite_io(inode, offset, count, unwritten); + + /* + * Determine whether we need to upgrade to an exclusive lock. This is + * required to change security info in file_modified(), for extending + * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten + * extents (as partial block zeroing may be required). + * + * Note that unaligned writes are allowed under shared lock so long as + * they are pure overwrites. Otherwise, concurrent unaligned writes risk + * data corruption due to partial block zeroing in the dio layer, and so + * the I/O must occur exclusively. + */ + if (*ilock_shared && + ((!IS_NOSEC(inode) || *extend || !overwrite || + (unaligned_io && *unwritten)))) { + if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; + } + inode_unlock_shared(inode); + *ilock_shared = false; + inode_lock(inode); + goto restart; + } + + /* + * Now that locking is settled, determine dio flags and exclusivity + * requirements. We don't use DIO_OVERWRITE_ONLY because we enforce + * behavior already. The inode lock is already held exclusive if the + * write is non-overwrite or extending, so drain all outstanding dio and + * set the force wait dio flag. + */ + if (!*ilock_shared && (unaligned_io || *extend)) { + if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; + } + if (unaligned_io && (!overwrite || *unwritten)) + inode_dio_wait(inode); + *dio_flags = IOMAP_DIO_FORCE_WAIT; + } + + ret = file_modified(file); + if (ret < 0) + goto out; + + return count; +out: + if (*ilock_shared) + inode_unlock_shared(inode); + else + inode_unlock(inode); + return ret; +} + +static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + ssize_t ret; + handle_t *handle; + struct inode *inode = file_inode(iocb->ki_filp); + loff_t offset = iocb->ki_pos; + size_t count = iov_iter_count(from); + const struct iomap_ops *iomap_ops = &ext4_iomap_ops; + bool extend = false, unwritten = false; + bool ilock_shared = true; + int dio_flags = 0; + + /* + * Quick check here without any i_rwsem lock to see if it is extending + * IO. A more reliable check is done in ext4_dio_write_checks() with + * proper locking in place. + */ + if (offset + count > i_size_read(inode)) + ilock_shared = false; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (ilock_shared) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + if (!inode_trylock(inode)) + return -EAGAIN; + } + } else { + if (ilock_shared) + inode_lock_shared(inode); + else + inode_lock(inode); + } + + /* Fallback to buffered I/O if the inode does not support direct I/O. */ + if (!ext4_should_use_dio(iocb, from)) { + if (ilock_shared) + inode_unlock_shared(inode); + else + inode_unlock(inode); + return ext4_buffered_write_iter(iocb, from); + } + + /* + * Prevent inline data from being created since we are going to allocate + * blocks for DIO. We know the inode does not currently have inline data + * because ext4_should_use_dio() checked for it, but we have to clear + * the state flag before the write checks because a lock cycle could + * introduce races with other writers. + */ + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + + ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend, + &unwritten, &dio_flags); + if (ret <= 0) + return ret; + + offset = iocb->ki_pos; + count = ret; + + if (extend) { + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + ret = ext4_orphan_add(handle, inode); + ext4_journal_stop(handle); + if (ret) + goto out; + } + + if (ilock_shared && !unwritten) + iomap_ops = &ext4_iomap_overwrite_ops; + ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, + dio_flags, NULL, 0); + if (ret == -ENOTBLK) + ret = 0; + if (extend) { + /* + * We always perform extending DIO write synchronously so by + * now the IO is completed and ext4_handle_inode_extension() + * was called. Cleanup the inode in case of error or race with + * writeback of delalloc blocks. + */ + WARN_ON_ONCE(ret == -EIOCBQUEUED); + ext4_inode_extension_cleanup(inode, ret < 0); + } + +out: + if (ilock_shared) + inode_unlock_shared(inode); + else + inode_unlock(inode); + + if (ret >= 0 && iov_iter_count(from)) { + ssize_t err; + loff_t endbyte; + + /* + * There is no support for atomic writes on buffered-io yet, + * we should never fallback to buffered-io for DIO atomic + * writes. + */ + WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC); + + offset = iocb->ki_pos; + err = ext4_buffered_write_iter(iocb, from); + if (err < 0) + return err; + + /* + * We need to ensure that the pages within the page cache for + * the range covered by this I/O are written to disk and + * invalidated. This is in attempt to preserve the expected + * direct I/O semantics in the case we fallback to buffered I/O + * to complete off the I/O request. + */ + ret += err; + endbyte = offset + err - 1; + err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping, + offset, endbyte); + if (!err) + invalidate_mapping_pages(iocb->ki_filp->f_mapping, + offset >> PAGE_SHIFT, + endbyte >> PAGE_SHIFT); + } + + return ret; +} + +#ifdef CONFIG_FS_DAX +static ssize_t +ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + ssize_t ret; + size_t count; + loff_t offset; + handle_t *handle; + bool extend = false; + struct inode *inode = file_inode(iocb->ki_filp); + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + + ret = ext4_write_checks(iocb, from); + if (ret <= 0) + goto out; + + offset = iocb->ki_pos; + count = iov_iter_count(from); + + if (offset + count > EXT4_I(inode)->i_disksize) { + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + + extend = true; + ext4_journal_stop(handle); + } + + ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); + + if (extend) { + ret = ext4_handle_inode_extension(inode, offset, ret, count); + ext4_inode_extension_cleanup(inode, ret < (ssize_t)count); + } +out: + inode_unlock(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} +#endif + +static ssize_t +ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + int ret; + struct inode *inode = file_inode(iocb->ki_filp); + + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; + +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + return ext4_dax_write_iter(iocb, from); +#endif + + if (iocb->ki_flags & IOCB_ATOMIC) { + size_t len = iov_iter_count(from); + + if (len < EXT4_SB(inode->i_sb)->s_awu_min || + len > EXT4_SB(inode->i_sb)->s_awu_max) + return -EINVAL; + + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } + + if (iocb->ki_flags & IOCB_DIRECT) + return ext4_dio_write_iter(iocb, from); + else + return ext4_buffered_write_iter(iocb, from); +} + +#ifdef CONFIG_FS_DAX +static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order) +{ + int error = 0; + vm_fault_t result; + int retries = 0; + handle_t *handle = NULL; + struct inode *inode = file_inode(vmf->vma->vm_file); + struct super_block *sb = inode->i_sb; + + /* + * We have to distinguish real writes from writes which will result in a + * COW page; COW writes should *not* poke the journal (the file will not + * be changed). Doing so would cause unintended failures when mounted + * read-only. + * + * We check for VM_SHARED rather than vmf->cow_page since the latter is + * unset for order != 0 (i.e. only in do_cow_fault); for + * other sizes, dax_iomap_fault will handle splitting / fallback so that + * we eventually come back with a COW page. + */ + bool write = (vmf->flags & FAULT_FLAG_WRITE) && + (vmf->vma->vm_flags & VM_SHARED); + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + unsigned long pfn; + + if (write) { + sb_start_pagefault(sb); + file_update_time(vmf->vma->vm_file); + filemap_invalidate_lock_shared(mapping); +retry: + handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, + EXT4_DATA_TRANS_BLOCKS(sb)); + if (IS_ERR(handle)) { + filemap_invalidate_unlock_shared(mapping); + sb_end_pagefault(sb); + return VM_FAULT_SIGBUS; + } + } else { + filemap_invalidate_lock_shared(mapping); + } + result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops); + if (write) { + ext4_journal_stop(handle); + + if ((result & VM_FAULT_ERROR) && error == -ENOSPC && + ext4_should_retry_alloc(sb, &retries)) + goto retry; + /* Handling synchronous page fault? */ + if (result & VM_FAULT_NEEDDSYNC) + result = dax_finish_sync_fault(vmf, order, pfn); + filemap_invalidate_unlock_shared(mapping); + sb_end_pagefault(sb); + } else { + filemap_invalidate_unlock_shared(mapping); + } + + return result; +} + +static vm_fault_t ext4_dax_fault(struct vm_fault *vmf) +{ + return ext4_dax_huge_fault(vmf, 0); +} + +static const struct vm_operations_struct ext4_dax_vm_ops = { + .fault = ext4_dax_fault, + .huge_fault = ext4_dax_huge_fault, + .page_mkwrite = ext4_dax_fault, + .pfn_mkwrite = ext4_dax_fault, +}; +#else +#define ext4_dax_vm_ops ext4_file_vm_ops +#endif + +static const struct vm_operations_struct ext4_file_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = ext4_page_mkwrite, +}; + +static int ext4_file_mmap_prepare(struct vm_area_desc *desc) +{ + int ret; + struct file *file = desc->file; + struct inode *inode = file->f_mapping->host; + struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; + + if (file->f_mode & FMODE_WRITE) + ret = ext4_emergency_state(inode->i_sb); + else + ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; + if (unlikely(ret)) + return ret; + + /* + * We don't support synchronous mappings for non-DAX files and + * for DAX files if underneath dax_device is not synchronous. + */ + if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), dax_dev)) + return -EOPNOTSUPP; + + file_accessed(file); + if (IS_DAX(file_inode(file))) { + desc->vm_ops = &ext4_dax_vm_ops; + desc->vm_flags |= VM_HUGEPAGE; + } else { + desc->vm_ops = &ext4_file_vm_ops; + } + return 0; +} + +static int ext4_sample_last_mounted(struct super_block *sb, + struct vfsmount *mnt) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct path path; + char buf[64], *cp; + handle_t *handle; + int err; + + if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED))) + return 0; + + if (ext4_emergency_state(sb) || sb_rdonly(sb) || + !sb_start_intwrite_trylock(sb)) + return 0; + + ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED); + /* + * Sample where the filesystem has been mounted and + * store it in the superblock for sysadmin convenience + * when trying to sort through large numbers of block + * devices or filesystem images. + */ + memset(buf, 0, sizeof(buf)); + path.mnt = mnt; + path.dentry = mnt->mnt_root; + cp = d_path(&path, buf, sizeof(buf)); + err = 0; + if (IS_ERR(cp)) + goto out; + + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + err = PTR_ERR(handle); + if (IS_ERR(handle)) + goto out; + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); + if (err) + goto out_journal; + lock_buffer(sbi->s_sbh); + strtomem_pad(sbi->s_es->s_last_mounted, cp, 0); + ext4_superblock_csum_set(sb); + unlock_buffer(sbi->s_sbh); + ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); +out_journal: + ext4_journal_stop(handle); +out: + sb_end_intwrite(sb); + return err; +} + +static int ext4_file_open(struct inode *inode, struct file *filp) +{ + int ret; + + if (filp->f_mode & FMODE_WRITE) + ret = ext4_emergency_state(inode->i_sb); + else + ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; + if (unlikely(ret)) + return ret; + + ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); + if (ret) + return ret; + + ret = fscrypt_file_open(inode, filp); + if (ret) + return ret; + + ret = fsverity_file_open(inode, filp); + if (ret) + return ret; + + /* + * Set up the jbd2_inode if we are opening the inode for + * writing and the journal is present + */ + if (filp->f_mode & FMODE_WRITE) { + ret = ext4_inode_attach_jinode(inode); + if (ret < 0) + return ret; + } + + if (ext4_inode_can_atomic_write(inode)) + filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + + filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; + return dquot_file_open(inode, filp); +} + +/* + * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values + * by calling generic_file_llseek_size() with the appropriate maxbytes + * value for each. + */ +loff_t ext4_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes = ext4_get_maxbytes(inode); + + switch (whence) { + default: + return generic_file_llseek_size(file, offset, whence, + maxbytes, i_size_read(inode)); + case SEEK_HOLE: + inode_lock_shared(inode); + offset = iomap_seek_hole(inode, offset, + &ext4_iomap_report_ops); + inode_unlock_shared(inode); + break; + case SEEK_DATA: + inode_lock_shared(inode); + offset = iomap_seek_data(inode, offset, + &ext4_iomap_report_ops); + inode_unlock_shared(inode); + break; + } + + if (offset < 0) + return offset; + return vfs_setpos(file, offset, maxbytes); +} + +const struct file_operations ext4_file_operations = { + .llseek = ext4_llseek, + .read_iter = ext4_file_read_iter, + .write_iter = ext4_file_write_iter, + .iopoll = iocb_bio_iopoll, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .mmap_prepare = ext4_file_mmap_prepare, + .open = ext4_file_open, + .release = ext4_release_file, + .fsync = ext4_sync_file, + .get_unmapped_area = thp_get_unmapped_area, + .splice_read = ext4_file_splice_read, + .splice_write = iter_file_splice_write, + .fallocate = ext4_fallocate, + .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | + FOP_DIO_PARALLEL_WRITE | + FOP_DONTCACHE, +}; + +const struct inode_operations ext4_file_inode_operations = { + .setattr = ext4_setattr, + .getattr = ext4_file_getattr, + .listxattr = ext4_listxattr, + .get_inode_acl = ext4_get_acl, + .set_acl = ext4_set_acl, + .fiemap = ext4_fiemap, + .fileattr_get = ext4_fileattr_get, + .fileattr_set = ext4_fileattr_set, +}; + diff --git a/fs/ext4l/fsmap.c b/fs/ext4l/fsmap.c new file mode 100644 index 00000000000..22fc333244e --- /dev/null +++ b/fs/ext4l/fsmap.c @@ -0,0 +1,792 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include "ext4.h" +#include <linux/fsmap.h> +#include "fsmap.h" +#include "mballoc.h" +#include <linux/sort.h> +#include <linux/list_sort.h> +#include <trace/events/ext4.h> + +/* Convert an ext4_fsmap to an fsmap. */ +void ext4_fsmap_from_internal(struct super_block *sb, struct fsmap *dest, + struct ext4_fsmap *src) +{ + dest->fmr_device = src->fmr_device; + dest->fmr_flags = src->fmr_flags; + dest->fmr_physical = src->fmr_physical << sb->s_blocksize_bits; + dest->fmr_owner = src->fmr_owner; + dest->fmr_offset = 0; + dest->fmr_length = src->fmr_length << sb->s_blocksize_bits; + dest->fmr_reserved[0] = 0; + dest->fmr_reserved[1] = 0; + dest->fmr_reserved[2] = 0; +} + +/* Convert an fsmap to an ext4_fsmap. */ +void ext4_fsmap_to_internal(struct super_block *sb, struct ext4_fsmap *dest, + struct fsmap *src) +{ + dest->fmr_device = src->fmr_device; + dest->fmr_flags = src->fmr_flags; + dest->fmr_physical = src->fmr_physical >> sb->s_blocksize_bits; + dest->fmr_owner = src->fmr_owner; + dest->fmr_length = src->fmr_length >> sb->s_blocksize_bits; +} + +/* getfsmap query state */ +struct ext4_getfsmap_info { + struct ext4_fsmap_head *gfi_head; + ext4_fsmap_format_t gfi_formatter; /* formatting fn */ + void *gfi_format_arg;/* format buffer */ + ext4_fsblk_t gfi_next_fsblk; /* next fsblock we expect */ + u32 gfi_dev; /* device id */ + ext4_group_t gfi_agno; /* bg number, if applicable */ + struct ext4_fsmap gfi_low; /* low rmap key */ + struct ext4_fsmap gfi_high; /* high rmap key */ + struct ext4_fsmap gfi_lastfree; /* free ext at end of last bg */ + struct list_head gfi_meta_list; /* fixed metadata list */ + bool gfi_last; /* last extent? */ +}; + +/* Associate a device with a getfsmap handler. */ +struct ext4_getfsmap_dev { + int (*gfd_fn)(struct super_block *sb, + struct ext4_fsmap *keys, + struct ext4_getfsmap_info *info); + u32 gfd_dev; +}; + +/* Compare two getfsmap device handlers. */ +static int ext4_getfsmap_dev_compare(const void *p1, const void *p2) +{ + const struct ext4_getfsmap_dev *d1 = p1; + const struct ext4_getfsmap_dev *d2 = p2; + + return d1->gfd_dev - d2->gfd_dev; +} + +/* Compare a record against our starting point */ +static bool ext4_getfsmap_rec_before_low_key(struct ext4_getfsmap_info *info, + struct ext4_fsmap *rec) +{ + return rec->fmr_physical + rec->fmr_length <= + info->gfi_low.fmr_physical; +} + +/* + * Format a reverse mapping for getfsmap, having translated rm_startblock + * into the appropriate daddr units. + */ +static int ext4_getfsmap_helper(struct super_block *sb, + struct ext4_getfsmap_info *info, + struct ext4_fsmap *rec) +{ + struct ext4_fsmap fmr; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t rec_fsblk = rec->fmr_physical; + ext4_group_t agno; + ext4_grpblk_t cno; + int error; + + if (fatal_signal_pending(current)) + return -EINTR; + + /* + * Filter out records that start before our startpoint, if the + * caller requested that. + */ + if (ext4_getfsmap_rec_before_low_key(info, rec)) { + rec_fsblk += rec->fmr_length; + if (info->gfi_next_fsblk < rec_fsblk) + info->gfi_next_fsblk = rec_fsblk; + return EXT4_QUERY_RANGE_CONTINUE; + } + + /* Are we just counting mappings? */ + if (info->gfi_head->fmh_count == 0) { + if (info->gfi_head->fmh_entries == UINT_MAX) + return EXT4_QUERY_RANGE_ABORT; + + if (rec_fsblk > info->gfi_next_fsblk) + info->gfi_head->fmh_entries++; + + if (info->gfi_last) + return EXT4_QUERY_RANGE_CONTINUE; + + info->gfi_head->fmh_entries++; + + rec_fsblk += rec->fmr_length; + if (info->gfi_next_fsblk < rec_fsblk) + info->gfi_next_fsblk = rec_fsblk; + return EXT4_QUERY_RANGE_CONTINUE; + } + + /* + * If the record starts past the last physical block we saw, + * then we've found a gap. Report the gap as being owned by + * whatever the caller specified is the missing owner. + */ + if (rec_fsblk > info->gfi_next_fsblk) { + if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count) + return EXT4_QUERY_RANGE_ABORT; + + ext4_get_group_no_and_offset(sb, info->gfi_next_fsblk, + &agno, &cno); + trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno, + EXT4_C2B(sbi, cno), + rec_fsblk - info->gfi_next_fsblk, + EXT4_FMR_OWN_UNKNOWN); + + fmr.fmr_device = info->gfi_dev; + fmr.fmr_physical = info->gfi_next_fsblk; + fmr.fmr_owner = EXT4_FMR_OWN_UNKNOWN; + fmr.fmr_length = rec_fsblk - info->gfi_next_fsblk; + fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; + error = info->gfi_formatter(&fmr, info->gfi_format_arg); + if (error) + return error; + info->gfi_head->fmh_entries++; + } + + if (info->gfi_last) + goto out; + + /* Fill out the extent we found */ + if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count) + return EXT4_QUERY_RANGE_ABORT; + + ext4_get_group_no_and_offset(sb, rec_fsblk, &agno, &cno); + trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno, EXT4_C2B(sbi, cno), + rec->fmr_length, rec->fmr_owner); + + fmr.fmr_device = info->gfi_dev; + fmr.fmr_physical = rec_fsblk; + fmr.fmr_owner = rec->fmr_owner; + fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; + fmr.fmr_length = rec->fmr_length; + error = info->gfi_formatter(&fmr, info->gfi_format_arg); + if (error) + return error; + info->gfi_head->fmh_entries++; + +out: + rec_fsblk += rec->fmr_length; + if (info->gfi_next_fsblk < rec_fsblk) + info->gfi_next_fsblk = rec_fsblk; + return EXT4_QUERY_RANGE_CONTINUE; +} + +static inline ext4_fsblk_t ext4_fsmap_next_pblk(struct ext4_fsmap *fmr) +{ + return fmr->fmr_physical + fmr->fmr_length; +} + +static int ext4_getfsmap_meta_helper(struct super_block *sb, + ext4_group_t agno, ext4_grpblk_t start, + ext4_grpblk_t len, void *priv) +{ + struct ext4_getfsmap_info *info = priv; + struct ext4_fsmap *p; + struct ext4_fsmap *tmp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t fsb, fs_start, fs_end; + int error; + + fs_start = fsb = (EXT4_C2B(sbi, start) + + ext4_group_first_block_no(sb, agno)); + fs_end = fs_start + EXT4_C2B(sbi, len); + + /* + * Return relevant extents from the meta_list. We emit all extents that + * partially/fully overlap with the query range + */ + list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) { + if (p->fmr_physical + p->fmr_length <= info->gfi_next_fsblk) { + list_del(&p->fmr_list); + kfree(p); + continue; + } + if (p->fmr_physical <= fs_end && + p->fmr_physical + p->fmr_length > fs_start) { + /* Emit the retained free extent record if present */ + if (info->gfi_lastfree.fmr_owner) { + error = ext4_getfsmap_helper(sb, info, + &info->gfi_lastfree); + if (error) + return error; + info->gfi_lastfree.fmr_owner = 0; + } + error = ext4_getfsmap_helper(sb, info, p); + if (error) + return error; + fsb = p->fmr_physical + p->fmr_length; + if (info->gfi_next_fsblk < fsb) + info->gfi_next_fsblk = fsb; + list_del(&p->fmr_list); + kfree(p); + continue; + } + } + if (info->gfi_next_fsblk < fsb) + info->gfi_next_fsblk = fsb; + + return 0; +} + + +/* Transform a blockgroup's free record into a fsmap */ +static int ext4_getfsmap_datadev_helper(struct super_block *sb, + ext4_group_t agno, ext4_grpblk_t start, + ext4_grpblk_t len, void *priv) +{ + struct ext4_fsmap irec; + struct ext4_getfsmap_info *info = priv; + struct ext4_fsmap *p; + struct ext4_fsmap *tmp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t fsb; + ext4_fsblk_t fslen; + int error; + + fsb = (EXT4_C2B(sbi, start) + ext4_group_first_block_no(sb, agno)); + fslen = EXT4_C2B(sbi, len); + + /* If the retained free extent record is set... */ + if (info->gfi_lastfree.fmr_owner) { + /* ...and abuts this one, lengthen it and return. */ + if (ext4_fsmap_next_pblk(&info->gfi_lastfree) == fsb) { + info->gfi_lastfree.fmr_length += fslen; + return 0; + } + + /* + * There's a gap between the two free extents; emit the + * retained extent prior to merging the meta_list. + */ + error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree); + if (error) + return error; + info->gfi_lastfree.fmr_owner = 0; + } + + /* Merge in any relevant extents from the meta_list */ + list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) { + if (p->fmr_physical + p->fmr_length <= info->gfi_next_fsblk) { + list_del(&p->fmr_list); + kfree(p); + } else if (p->fmr_physical < fsb) { + error = ext4_getfsmap_helper(sb, info, p); + if (error) + return error; + + list_del(&p->fmr_list); + kfree(p); + } + } + + irec.fmr_device = 0; + irec.fmr_physical = fsb; + irec.fmr_length = fslen; + irec.fmr_owner = EXT4_FMR_OWN_FREE; + irec.fmr_flags = 0; + + /* If this is a free extent at the end of a bg, buffer it. */ + if (ext4_fsmap_next_pblk(&irec) == + ext4_group_first_block_no(sb, agno + 1)) { + info->gfi_lastfree = irec; + return 0; + } + + /* Otherwise, emit it */ + return ext4_getfsmap_helper(sb, info, &irec); +} + +/* Execute a getfsmap query against the log device. */ +static int ext4_getfsmap_logdev(struct super_block *sb, struct ext4_fsmap *keys, + struct ext4_getfsmap_info *info) +{ + journal_t *journal = EXT4_SB(sb)->s_journal; + struct ext4_fsmap irec; + + /* Set up search keys */ + info->gfi_low = keys[0]; + info->gfi_low.fmr_length = 0; + + memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high)); + + trace_ext4_fsmap_low_key(sb, info->gfi_dev, 0, + info->gfi_low.fmr_physical, + info->gfi_low.fmr_length, + info->gfi_low.fmr_owner); + + trace_ext4_fsmap_high_key(sb, info->gfi_dev, 0, + info->gfi_high.fmr_physical, + info->gfi_high.fmr_length, + info->gfi_high.fmr_owner); + + if (keys[0].fmr_physical > 0) + return 0; + + /* Fabricate an rmap entry for the external log device. */ + irec.fmr_physical = journal->j_blk_offset; + irec.fmr_length = journal->j_total_len; + irec.fmr_owner = EXT4_FMR_OWN_LOG; + irec.fmr_flags = 0; + + return ext4_getfsmap_helper(sb, info, &irec); +} + +/* Helper to fill out an ext4_fsmap. */ +static inline int ext4_getfsmap_fill(struct list_head *meta_list, + ext4_fsblk_t fsb, ext4_fsblk_t len, + uint64_t owner) +{ + struct ext4_fsmap *fsm; + + fsm = kmalloc(sizeof(*fsm), GFP_NOFS); + if (!fsm) + return -ENOMEM; + fsm->fmr_device = 0; + fsm->fmr_flags = 0; + fsm->fmr_physical = fsb; + fsm->fmr_owner = owner; + fsm->fmr_length = len; + list_add_tail(&fsm->fmr_list, meta_list); + + return 0; +} + +/* + * This function returns the number of file system metadata blocks at + * the beginning of a block group, including the reserved gdt blocks. + */ +static unsigned int ext4_getfsmap_find_sb(struct super_block *sb, + ext4_group_t agno, + struct list_head *meta_list) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t fsb = ext4_group_first_block_no(sb, agno); + ext4_fsblk_t len; + unsigned long first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); + unsigned long metagroup = agno / EXT4_DESC_PER_BLOCK(sb); + int error; + + /* Record the superblock. */ + if (ext4_bg_has_super(sb, agno)) { + error = ext4_getfsmap_fill(meta_list, fsb, 1, EXT4_FMR_OWN_FS); + if (error) + return error; + fsb++; + } + + /* Record the group descriptors. */ + len = ext4_bg_num_gdb(sb, agno); + if (!len) + return 0; + error = ext4_getfsmap_fill(meta_list, fsb, len, + EXT4_FMR_OWN_GDT); + if (error) + return error; + fsb += len; + + /* Reserved GDT blocks */ + if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) { + len = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + + /* + * mkfs.ext4 can set s_reserved_gdt_blocks as 0 in some cases, + * check for that. + */ + if (!len) + return 0; + + error = ext4_getfsmap_fill(meta_list, fsb, len, + EXT4_FMR_OWN_RESV_GDT); + if (error) + return error; + } + + return 0; +} + +/* Compare two fsmap items. */ +static int ext4_getfsmap_compare(void *priv, + const struct list_head *a, + const struct list_head *b) +{ + struct ext4_fsmap *fa; + struct ext4_fsmap *fb; + + fa = container_of(a, struct ext4_fsmap, fmr_list); + fb = container_of(b, struct ext4_fsmap, fmr_list); + if (fa->fmr_physical < fb->fmr_physical) + return -1; + else if (fa->fmr_physical > fb->fmr_physical) + return 1; + return 0; +} + +/* Merge adjacent extents of fixed metadata. */ +static void ext4_getfsmap_merge_fixed_metadata(struct list_head *meta_list) +{ + struct ext4_fsmap *p; + struct ext4_fsmap *prev = NULL; + struct ext4_fsmap *tmp; + + list_for_each_entry_safe(p, tmp, meta_list, fmr_list) { + if (!prev) { + prev = p; + continue; + } + + if (prev->fmr_owner == p->fmr_owner && + prev->fmr_physical + prev->fmr_length == p->fmr_physical) { + prev->fmr_length += p->fmr_length; + list_del(&p->fmr_list); + kfree(p); + } else + prev = p; + } +} + +/* Free a list of fixed metadata. */ +static void ext4_getfsmap_free_fixed_metadata(struct list_head *meta_list) +{ + struct ext4_fsmap *p; + struct ext4_fsmap *tmp; + + list_for_each_entry_safe(p, tmp, meta_list, fmr_list) { + list_del(&p->fmr_list); + kfree(p); + } +} + +/* Find all the fixed metadata in the filesystem. */ +static int ext4_getfsmap_find_fixed_metadata(struct super_block *sb, + struct list_head *meta_list) +{ + struct ext4_group_desc *gdp; + ext4_group_t agno; + int error; + + INIT_LIST_HEAD(meta_list); + + /* Collect everything. */ + for (agno = 0; agno < EXT4_SB(sb)->s_groups_count; agno++) { + gdp = ext4_get_group_desc(sb, agno, NULL); + if (!gdp) { + error = -EFSCORRUPTED; + goto err; + } + + /* Superblock & GDT */ + error = ext4_getfsmap_find_sb(sb, agno, meta_list); + if (error) + goto err; + + /* Block bitmap */ + error = ext4_getfsmap_fill(meta_list, + ext4_block_bitmap(sb, gdp), 1, + EXT4_FMR_OWN_BLKBM); + if (error) + goto err; + + /* Inode bitmap */ + error = ext4_getfsmap_fill(meta_list, + ext4_inode_bitmap(sb, gdp), 1, + EXT4_FMR_OWN_INOBM); + if (error) + goto err; + + /* Inodes */ + error = ext4_getfsmap_fill(meta_list, + ext4_inode_table(sb, gdp), + EXT4_SB(sb)->s_itb_per_group, + EXT4_FMR_OWN_INODES); + if (error) + goto err; + } + + /* Sort the list */ + list_sort(NULL, meta_list, ext4_getfsmap_compare); + + /* Merge adjacent extents */ + ext4_getfsmap_merge_fixed_metadata(meta_list); + + return 0; +err: + ext4_getfsmap_free_fixed_metadata(meta_list); + return error; +} + +/* Execute a getfsmap query against the buddy bitmaps */ +static int ext4_getfsmap_datadev(struct super_block *sb, + struct ext4_fsmap *keys, + struct ext4_getfsmap_info *info) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t start_fsb; + ext4_fsblk_t end_fsb; + ext4_fsblk_t bofs; + ext4_fsblk_t eofs; + ext4_group_t start_ag; + ext4_group_t end_ag; + ext4_grpblk_t first_cluster; + ext4_grpblk_t last_cluster; + struct ext4_fsmap irec; + int error = 0; + + bofs = le32_to_cpu(sbi->s_es->s_first_data_block); + eofs = ext4_blocks_count(sbi->s_es); + if (keys[0].fmr_physical >= eofs) + return 0; + else if (keys[0].fmr_physical < bofs) + keys[0].fmr_physical = bofs; + if (keys[1].fmr_physical >= eofs) + keys[1].fmr_physical = eofs - 1; + if (keys[1].fmr_physical < keys[0].fmr_physical) + return 0; + start_fsb = keys[0].fmr_physical; + end_fsb = keys[1].fmr_physical; + + /* Determine first and last group to examine based on start and end */ + ext4_get_group_no_and_offset(sb, start_fsb, &start_ag, &first_cluster); + ext4_get_group_no_and_offset(sb, end_fsb, &end_ag, &last_cluster); + + /* + * Convert the fsmap low/high keys to bg based keys. Initialize + * low to the fsmap low key and max out the high key to the end + * of the bg. + */ + info->gfi_low = keys[0]; + info->gfi_low.fmr_physical = EXT4_C2B(sbi, first_cluster); + info->gfi_low.fmr_length = 0; + + memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high)); + + /* Assemble a list of all the fixed-location metadata. */ + error = ext4_getfsmap_find_fixed_metadata(sb, &info->gfi_meta_list); + if (error) + goto err; + + /* Query each bg */ + for (info->gfi_agno = start_ag; + info->gfi_agno <= end_ag; + info->gfi_agno++) { + /* + * Set the bg high key from the fsmap high key if this + * is the last bg that we're querying. + */ + if (info->gfi_agno == end_ag) { + info->gfi_high = keys[1]; + info->gfi_high.fmr_physical = EXT4_C2B(sbi, + last_cluster); + info->gfi_high.fmr_length = 0; + } + + trace_ext4_fsmap_low_key(sb, info->gfi_dev, info->gfi_agno, + info->gfi_low.fmr_physical, + info->gfi_low.fmr_length, + info->gfi_low.fmr_owner); + + trace_ext4_fsmap_high_key(sb, info->gfi_dev, info->gfi_agno, + info->gfi_high.fmr_physical, + info->gfi_high.fmr_length, + info->gfi_high.fmr_owner); + + error = ext4_mballoc_query_range(sb, info->gfi_agno, + EXT4_B2C(sbi, info->gfi_low.fmr_physical), + EXT4_B2C(sbi, info->gfi_high.fmr_physical), + ext4_getfsmap_meta_helper, + ext4_getfsmap_datadev_helper, info); + if (error) + goto err; + + /* + * Set the bg low key to the start of the bg prior to + * moving on to the next bg. + */ + if (info->gfi_agno == start_ag) + memset(&info->gfi_low, 0, sizeof(info->gfi_low)); + } + + /* Do we have a retained free extent? */ + if (info->gfi_lastfree.fmr_owner) { + error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree); + if (error) + goto err; + } + + /* + * The dummy record below will cause ext4_getfsmap_helper() to report + * any allocated blocks at the end of the range. + */ + irec.fmr_device = 0; + irec.fmr_physical = end_fsb + 1; + irec.fmr_length = 0; + irec.fmr_owner = EXT4_FMR_OWN_FREE; + irec.fmr_flags = 0; + + info->gfi_last = true; + error = ext4_getfsmap_helper(sb, info, &irec); + if (error) + goto err; + +err: + ext4_getfsmap_free_fixed_metadata(&info->gfi_meta_list); + return error; +} + +/* Do we recognize the device? */ +static bool ext4_getfsmap_is_valid_device(struct super_block *sb, + struct ext4_fsmap *fm) +{ + if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || + fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev)) + return true; + if (EXT4_SB(sb)->s_journal_bdev_file && + fm->fmr_device == + new_encode_dev(file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev)) + return true; + return false; +} + +/* Ensure that the low key is less than the high key. */ +static bool ext4_getfsmap_check_keys(struct ext4_fsmap *low_key, + struct ext4_fsmap *high_key) +{ + if (low_key->fmr_device > high_key->fmr_device) + return false; + if (low_key->fmr_device < high_key->fmr_device) + return true; + + if (low_key->fmr_physical > high_key->fmr_physical) + return false; + if (low_key->fmr_physical < high_key->fmr_physical) + return true; + + if (low_key->fmr_owner > high_key->fmr_owner) + return false; + if (low_key->fmr_owner < high_key->fmr_owner) + return true; + + return false; +} + +#define EXT4_GETFSMAP_DEVS 2 +/* + * Get filesystem's extents as described in head, and format for + * output. Calls formatter to fill the user's buffer until all + * extents are mapped, until the passed-in head->fmh_count slots have + * been filled, or until the formatter short-circuits the loop, if it + * is tracking filled-in extents on its own. + * + * Key to Confusion + * ---------------- + * There are multiple levels of keys and counters at work here: + * _fsmap_head.fmh_keys -- low and high fsmap keys passed in; + * these reflect fs-wide block addrs. + * dkeys -- fmh_keys used to query each device; + * these are fmh_keys but w/ the low key + * bumped up by fmr_length. + * _getfsmap_info.gfi_next_fsblk-- next fs block we expect to see; this + * is how we detect gaps in the fsmap + * records and report them. + * _getfsmap_info.gfi_low/high -- per-bg low/high keys computed from + * dkeys; used to query the free space. + */ +int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, + ext4_fsmap_format_t formatter, void *arg) +{ + struct ext4_fsmap dkeys[2]; /* per-dev keys */ + struct ext4_getfsmap_dev handlers[EXT4_GETFSMAP_DEVS]; + struct ext4_getfsmap_info info = { NULL }; + int i; + int error = 0; + + if (head->fmh_iflags & ~FMH_IF_VALID) + return -EINVAL; + if (!ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[0]) || + !ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[1])) + return -EINVAL; + + head->fmh_entries = 0; + + /* Set up our device handlers. */ + memset(handlers, 0, sizeof(handlers)); + handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev); + handlers[0].gfd_fn = ext4_getfsmap_datadev; + if (EXT4_SB(sb)->s_journal_bdev_file) { + handlers[1].gfd_dev = new_encode_dev( + file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev); + handlers[1].gfd_fn = ext4_getfsmap_logdev; + } + + sort(handlers, EXT4_GETFSMAP_DEVS, sizeof(struct ext4_getfsmap_dev), + ext4_getfsmap_dev_compare, NULL); + + /* + * To continue where we left off, we allow userspace to use the + * last mapping from a previous call as the low key of the next. + * This is identified by a non-zero length in the low key. We + * have to increment the low key in this scenario to ensure we + * don't return the same mapping again, and instead return the + * very next mapping. + * + * Bump the physical offset as there can be no other mapping for + * the same physical block range. + */ + dkeys[0] = head->fmh_keys[0]; + dkeys[0].fmr_physical += dkeys[0].fmr_length; + dkeys[0].fmr_owner = 0; + dkeys[0].fmr_length = 0; + memset(&dkeys[1], 0xFF, sizeof(struct ext4_fsmap)); + + if (!ext4_getfsmap_check_keys(dkeys, &head->fmh_keys[1])) + return -EINVAL; + + info.gfi_next_fsblk = head->fmh_keys[0].fmr_physical + + head->fmh_keys[0].fmr_length; + info.gfi_formatter = formatter; + info.gfi_format_arg = arg; + info.gfi_head = head; + + /* For each device we support... */ + for (i = 0; i < EXT4_GETFSMAP_DEVS; i++) { + /* Is this device within the range the user asked for? */ + if (!handlers[i].gfd_fn) + continue; + if (head->fmh_keys[0].fmr_device > handlers[i].gfd_dev) + continue; + if (head->fmh_keys[1].fmr_device < handlers[i].gfd_dev) + break; + + /* + * If this device number matches the high key, we have + * to pass the high key to the handler to limit the + * query results. If the device number exceeds the + * low key, zero out the low key so that we get + * everything from the beginning. + */ + if (handlers[i].gfd_dev == head->fmh_keys[1].fmr_device) + dkeys[1] = head->fmh_keys[1]; + if (handlers[i].gfd_dev > head->fmh_keys[0].fmr_device) + memset(&dkeys[0], 0, sizeof(struct ext4_fsmap)); + + info.gfi_dev = handlers[i].gfd_dev; + info.gfi_last = false; + info.gfi_agno = -1; + error = handlers[i].gfd_fn(sb, dkeys, &info); + if (error) + break; + info.gfi_next_fsblk = 0; + } + + head->fmh_oflags = FMH_OF_DEV_T; + return error; +} diff --git a/fs/ext4l/fsmap.h b/fs/ext4l/fsmap.h new file mode 100644 index 00000000000..ac642be2302 --- /dev/null +++ b/fs/ext4l/fsmap.h @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#ifndef __EXT4_FSMAP_H__ +#define __EXT4_FSMAP_H__ + +struct fsmap; + +/* internal fsmap representation */ +struct ext4_fsmap { + struct list_head fmr_list; + dev_t fmr_device; /* device id */ + uint32_t fmr_flags; /* mapping flags */ + uint64_t fmr_physical; /* device offset of segment */ + uint64_t fmr_owner; /* owner id */ + uint64_t fmr_length; /* length of segment, blocks */ +}; + +struct ext4_fsmap_head { + uint32_t fmh_iflags; /* control flags */ + uint32_t fmh_oflags; /* output flags */ + unsigned int fmh_count; /* # of entries in array incl. input */ + unsigned int fmh_entries; /* # of entries filled in (output). */ + + struct ext4_fsmap fmh_keys[2]; /* low and high keys */ +}; + +void ext4_fsmap_from_internal(struct super_block *sb, struct fsmap *dest, + struct ext4_fsmap *src); +void ext4_fsmap_to_internal(struct super_block *sb, struct ext4_fsmap *dest, + struct fsmap *src); + +/* fsmap to userspace formatter - copy to user & advance pointer */ +typedef int (*ext4_fsmap_format_t)(struct ext4_fsmap *, void *); + +int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, + ext4_fsmap_format_t formatter, void *arg); + +#define EXT4_QUERY_RANGE_ABORT 1 +#define EXT4_QUERY_RANGE_CONTINUE 0 + +/* fmr_owner special values for FS_IOC_GETFSMAP; some share w/ XFS */ +#define EXT4_FMR_OWN_FREE FMR_OWN_FREE /* free space */ +#define EXT4_FMR_OWN_UNKNOWN FMR_OWN_UNKNOWN /* unknown owner */ +#define EXT4_FMR_OWN_FS FMR_OWNER('X', 1) /* static fs metadata */ +#define EXT4_FMR_OWN_LOG FMR_OWNER('X', 2) /* journalling log */ +#define EXT4_FMR_OWN_INODES FMR_OWNER('X', 5) /* inodes */ +#define EXT4_FMR_OWN_GDT FMR_OWNER('f', 1) /* group descriptors */ +#define EXT4_FMR_OWN_RESV_GDT FMR_OWNER('f', 2) /* reserved gdt blocks */ +#define EXT4_FMR_OWN_BLKBM FMR_OWNER('f', 3) /* block bitmap */ +#define EXT4_FMR_OWN_INOBM FMR_OWNER('f', 4) /* inode bitmap */ + +#endif /* __EXT4_FSMAP_H__ */ diff --git a/fs/ext4l/ialloc.c b/fs/ext4l/ialloc.c new file mode 100644 index 00000000000..ba4fd9aba1c --- /dev/null +++ b/fs/ext4l/ialloc.c @@ -0,0 +1,1621 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/ialloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * BSD ufs-inspired inode and directory allocation by + * Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/stat.h> +#include <linux/string.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> +#include <linux/random.h> +#include <linux/bitops.h> +#include <linux/blkdev.h> +#include <linux/cred.h> + +#include <asm/byteorder.h> + +#include "ext4.h" +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" + +#include <trace/events/ext4.h> + +/* + * ialloc.c contains the inodes allocation and deallocation routines + */ + +/* + * The free inodes are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. + */ + +/* + * To avoid calling the atomic setbit hundreds or thousands of times, we only + * need to use it within a single byte (to ensure we get endianness right). + * We can use memset for the rest of the bitmap as there are no other users. + */ +void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +{ + int i; + + if (start_bit >= end_bit) + return; + + ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); + for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) + ext4_set_bit(i, bitmap); + if (i < end_bit) + memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); +} + +void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) +{ + if (uptodate) { + set_buffer_uptodate(bh); + set_bitmap_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + +static int ext4_validate_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + ext4_group_t block_group, + struct buffer_head *bh) +{ + ext4_fsblk_t blk; + struct ext4_group_info *grp; + + if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) + return 0; + + if (buffer_verified(bh)) + return 0; + + grp = ext4_get_group_info(sb, block_group); + if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + return -EFSCORRUPTED; + + ext4_lock_group(sb, block_group); + if (buffer_verified(bh)) + goto verified; + blk = ext4_inode_bitmap(sb, desc); + if (!ext4_inode_bitmap_csum_verify(sb, desc, bh) || + ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " + "inode_bitmap = %llu", block_group, blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + return -EFSBADCRC; + } + set_buffer_verified(bh); +verified: + ext4_unlock_group(sb, block_group); + return 0; +} + +/* + * Read the inode allocation bitmap for a given block_group, reading + * into the specified slot in the superblock's bitmap cache. + * + * Return buffer_head of bitmap on success, or an ERR_PTR on error. + */ +static struct buffer_head * +ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) +{ + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *bh = NULL; + ext4_fsblk_t bitmap_blk; + int err; + + desc = ext4_get_group_desc(sb, block_group, NULL); + if (!desc) + return ERR_PTR(-EFSCORRUPTED); + + bitmap_blk = ext4_inode_bitmap(sb, desc); + if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { + ext4_error(sb, "Invalid inode bitmap blk %llu in " + "block_group %u", bitmap_blk, block_group); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + return ERR_PTR(-EFSCORRUPTED); + } + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext4_warning(sb, "Cannot read inode bitmap - " + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); + return ERR_PTR(-ENOMEM); + } + if (bitmap_uptodate(bh)) + goto verify; + + lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + goto verify; + } + + ext4_lock_group(sb, block_group); + if (ext4_has_group_desc_csum(sb) && + (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) { + if (block_group == 0) { + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + ext4_error(sb, "Inode bitmap for bg 0 marked " + "uninitialized"); + err = -EFSCORRUPTED; + goto out; + } + memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), + sb->s_blocksize * 8, bh->b_data); + set_bitmap_uptodate(bh); + set_buffer_uptodate(bh); + set_buffer_verified(bh); + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + return bh; + } + ext4_unlock_group(sb, block_group); + + if (buffer_uptodate(bh)) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh); + unlock_buffer(bh); + goto verify; + } + /* + * submit the buffer_head for reading + */ + trace_ext4_load_inode_bitmap(sb, block_group); + ext4_read_bh(bh, REQ_META | REQ_PRIO, + ext4_end_bitmap_read, + ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_EIO)); + if (!buffer_uptodate(bh)) { + put_bh(bh); + ext4_error_err(sb, EIO, "Cannot read inode bitmap - " + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + return ERR_PTR(-EIO); + } + +verify: + err = ext4_validate_inode_bitmap(sb, desc, block_group, bh); + if (err) + goto out; + return bh; +out: + put_bh(bh); + return ERR_PTR(err); +} + +/* + * NOTE! When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the harddisk. + */ +void ext4_free_inode(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + int is_directory; + unsigned long ino; + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bh2; + ext4_group_t block_group; + unsigned long bit; + struct ext4_group_desc *gdp; + struct ext4_super_block *es; + struct ext4_sb_info *sbi; + int fatal = 0, err, count, cleared; + struct ext4_group_info *grp; + + if (!sb) { + printk(KERN_ERR "EXT4-fs: %s:%d: inode on " + "nonexistent device\n", __func__, __LINE__); + return; + } + if (icount_read(inode) > 1) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", + __func__, __LINE__, inode->i_ino, + icount_read(inode)); + return; + } + if (inode->i_nlink) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n", + __func__, __LINE__, inode->i_ino, inode->i_nlink); + return; + } + sbi = EXT4_SB(sb); + + ino = inode->i_ino; + ext4_debug("freeing inode %lu\n", ino); + trace_ext4_free_inode(inode); + + dquot_initialize(inode); + dquot_free_inode(inode); + + is_directory = S_ISDIR(inode->i_mode); + + /* Do this BEFORE marking the inode not in use or returning an error */ + ext4_clear_inode(inode); + + es = sbi->s_es; + if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext4_error(sb, "reserved or nonexistent inode %lu", ino); + goto error_return; + } + block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); + /* Don't bother if the inode bitmap is corrupt. */ + if (IS_ERR(bitmap_bh)) { + fatal = PTR_ERR(bitmap_bh); + bitmap_bh = NULL; + goto error_return; + } + if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { + grp = ext4_get_group_info(sb, block_group); + if (!grp || unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) { + fatal = -EFSCORRUPTED; + goto error_return; + } + } + + BUFFER_TRACE(bitmap_bh, "get_write_access"); + fatal = ext4_journal_get_write_access(handle, sb, bitmap_bh, + EXT4_JTR_NONE); + if (fatal) + goto error_return; + + fatal = -ESRCH; + gdp = ext4_get_group_desc(sb, block_group, &bh2); + if (gdp) { + BUFFER_TRACE(bh2, "get_write_access"); + fatal = ext4_journal_get_write_access(handle, sb, bh2, + EXT4_JTR_NONE); + } + ext4_lock_group(sb, block_group); + cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data); + if (fatal || !cleared) { + ext4_unlock_group(sb, block_group); + goto out; + } + + count = ext4_free_inodes_count(sb, gdp) + 1; + ext4_free_inodes_set(sb, gdp, count); + if (is_directory) { + count = ext4_used_dirs_count(sb, gdp) - 1; + ext4_used_dirs_set(sb, gdp, count); + if (percpu_counter_initialized(&sbi->s_dirs_counter)) + percpu_counter_dec(&sbi->s_dirs_counter); + } + ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh); + ext4_group_desc_csum_set(sb, block_group, gdp); + ext4_unlock_group(sb, block_group); + + if (percpu_counter_initialized(&sbi->s_freeinodes_counter)) + percpu_counter_inc(&sbi->s_freeinodes_counter); + if (sbi->s_log_groups_per_flex) { + struct flex_groups *fg; + + fg = sbi_array_rcu_deref(sbi, s_flex_groups, + ext4_flex_group(sbi, block_group)); + atomic_inc(&fg->free_inodes); + if (is_directory) + atomic_dec(&fg->used_dirs); + } + BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); + fatal = ext4_handle_dirty_metadata(handle, NULL, bh2); +out: + if (cleared) { + BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (!fatal) + fatal = err; + } else { + ext4_error(sb, "bit already cleared for inode %lu", ino); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + } + +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, fatal); +} + +struct orlov_stats { + __u64 free_clusters; + __u32 free_inodes; + __u32 used_dirs; +}; + +/* + * Helper function for Orlov's allocator; returns critical information + * for a particular block group or flex_bg. If flex_size is 1, then g + * is a block group number; otherwise it is flex_bg number. + */ +static void get_orlov_stats(struct super_block *sb, ext4_group_t g, + int flex_size, struct orlov_stats *stats) +{ + struct ext4_group_desc *desc; + + if (flex_size > 1) { + struct flex_groups *fg = sbi_array_rcu_deref(EXT4_SB(sb), + s_flex_groups, g); + stats->free_inodes = atomic_read(&fg->free_inodes); + stats->free_clusters = atomic64_read(&fg->free_clusters); + stats->used_dirs = atomic_read(&fg->used_dirs); + return; + } + + desc = ext4_get_group_desc(sb, g, NULL); + if (desc) { + stats->free_inodes = ext4_free_inodes_count(sb, desc); + stats->free_clusters = ext4_free_group_clusters(sb, desc); + stats->used_dirs = ext4_used_dirs_count(sb, desc); + } else { + stats->free_inodes = 0; + stats->free_clusters = 0; + stats->used_dirs = 0; + } +} + +/* + * Orlov's allocator for directories. + * + * We always try to spread first-level directories. + * + * If there are blockgroups with both free inodes and free clusters counts + * not worse than average we return one with smallest directory count. + * Otherwise we simply return a random group. + * + * For the rest rules look so: + * + * It's OK to put directory into a group unless + * it has too many directories already (max_dirs) or + * it has too few free inodes left (min_inodes) or + * it has too few free clusters left (min_clusters) or + * Parent's group is preferred, if it doesn't satisfy these + * conditions we search cyclically through the rest. If none + * of the groups look good we just look for a group with more + * free inodes than average (starting at parent's group). + */ + +static int find_group_orlov(struct super_block *sb, struct inode *parent, + ext4_group_t *group, umode_t mode, + const struct qstr *qstr) +{ + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t real_ngroups = ext4_get_groups_count(sb); + int inodes_per_group = EXT4_INODES_PER_GROUP(sb); + unsigned int freei, avefreei, grp_free; + ext4_fsblk_t freec, avefreec; + unsigned int ndirs; + int max_dirs, min_inodes; + ext4_grpblk_t min_clusters; + ext4_group_t i, grp, g, ngroups; + struct ext4_group_desc *desc; + struct orlov_stats stats; + int flex_size = ext4_flex_bg_size(sbi); + struct dx_hash_info hinfo; + + ngroups = real_ngroups; + if (flex_size > 1) { + ngroups = (real_ngroups + flex_size - 1) >> + sbi->s_log_groups_per_flex; + parent_group >>= sbi->s_log_groups_per_flex; + } + + freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); + avefreei = freei / ngroups; + freec = percpu_counter_read_positive(&sbi->s_freeclusters_counter); + avefreec = freec; + do_div(avefreec, ngroups); + ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); + + if (S_ISDIR(mode) && + ((parent == d_inode(sb->s_root)) || + (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) { + int best_ndir = inodes_per_group; + int ret = -1; + + if (qstr) { + hinfo.hash_version = DX_HASH_HALF_MD4; + hinfo.seed = sbi->s_hash_seed; + ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo); + parent_group = hinfo.hash % ngroups; + } else + parent_group = get_random_u32_below(ngroups); + for (i = 0; i < ngroups; i++) { + g = (parent_group + i) % ngroups; + get_orlov_stats(sb, g, flex_size, &stats); + if (!stats.free_inodes) + continue; + if (stats.used_dirs >= best_ndir) + continue; + if (stats.free_inodes < avefreei) + continue; + if (stats.free_clusters < avefreec) + continue; + grp = g; + ret = 0; + best_ndir = stats.used_dirs; + } + if (ret) + goto fallback; + found_flex_bg: + if (flex_size == 1) { + *group = grp; + return 0; + } + + /* + * We pack inodes at the beginning of the flexgroup's + * inode tables. Block allocation decisions will do + * something similar, although regular files will + * start at 2nd block group of the flexgroup. See + * ext4_ext_find_goal() and ext4_find_near(). + */ + grp *= flex_size; + for (i = 0; i < flex_size; i++) { + if (grp+i >= real_ngroups) + break; + desc = ext4_get_group_desc(sb, grp+i, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) { + *group = grp+i; + return 0; + } + } + goto fallback; + } + + max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16; + min_inodes = avefreei - inodes_per_group*flex_size / 4; + if (min_inodes < 1) + min_inodes = 1; + min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4; + if (min_clusters < 0) + min_clusters = 0; + + /* + * Start looking in the flex group where we last allocated an + * inode for this parent directory + */ + if (EXT4_I(parent)->i_last_alloc_group != ~0) { + parent_group = EXT4_I(parent)->i_last_alloc_group; + if (flex_size > 1) + parent_group >>= sbi->s_log_groups_per_flex; + } + + for (i = 0; i < ngroups; i++) { + grp = (parent_group + i) % ngroups; + get_orlov_stats(sb, grp, flex_size, &stats); + if (stats.used_dirs >= max_dirs) + continue; + if (stats.free_inodes < min_inodes) + continue; + if (stats.free_clusters < min_clusters) + continue; + goto found_flex_bg; + } + +fallback: + ngroups = real_ngroups; + avefreei = freei / ngroups; +fallback_retry: + parent_group = EXT4_I(parent)->i_block_group; + for (i = 0; i < ngroups; i++) { + grp = (parent_group + i) % ngroups; + desc = ext4_get_group_desc(sb, grp, NULL); + if (desc) { + grp_free = ext4_free_inodes_count(sb, desc); + if (grp_free && grp_free >= avefreei) { + *group = grp; + return 0; + } + } + } + + if (avefreei) { + /* + * The free-inodes counter is approximate, and for really small + * filesystems the above test can fail to find any blockgroups + */ + avefreei = 0; + goto fallback_retry; + } + + return -1; +} + +static int find_group_other(struct super_block *sb, struct inode *parent, + ext4_group_t *group, umode_t mode) +{ + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); + struct ext4_group_desc *desc; + int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); + + /* + * Try to place the inode is the same flex group as its + * parent. If we can't find space, use the Orlov algorithm to + * find another flex group, and store that information in the + * parent directory's inode information so that use that flex + * group for future allocations. + */ + if (flex_size > 1) { + int retry = 0; + + try_again: + parent_group &= ~(flex_size-1); + last = parent_group + flex_size; + if (last > ngroups) + last = ngroups; + for (i = parent_group; i < last; i++) { + desc = ext4_get_group_desc(sb, i, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) { + *group = i; + return 0; + } + } + if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) { + retry = 1; + parent_group = EXT4_I(parent)->i_last_alloc_group; + goto try_again; + } + /* + * If this didn't work, use the Orlov search algorithm + * to find a new flex group; we pass in the mode to + * avoid the topdir algorithms. + */ + *group = parent_group + flex_size; + if (*group > ngroups) + *group = 0; + return find_group_orlov(sb, parent, group, mode, NULL); + } + + /* + * Try to place the inode in its parent directory + */ + *group = parent_group; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_group_clusters(sb, desc)) + return 0; + + /* + * We're going to place this inode in a different blockgroup from its + * parent. We want to cause files in a common directory to all land in + * the same blockgroup. But we want files which are in a different + * directory which shares a blockgroup with our parent to land in a + * different blockgroup. + * + * So add our directory's i_ino into the starting point for the hash. + */ + *group = (*group + parent->i_ino) % ngroups; + + /* + * Use a quadratic hash to find a group with a free inode and some free + * blocks. + */ + for (i = 1; i < ngroups; i <<= 1) { + *group += i; + if (*group >= ngroups) + *group -= ngroups; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_group_clusters(sb, desc)) + return 0; + } + + /* + * That failed: try linear search for a free inode, even if that group + * has no free blocks. + */ + *group = parent_group; + for (i = 0; i < ngroups; i++) { + if (++*group >= ngroups) + *group = 0; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) + return 0; + } + + return -1; +} + +/* + * In no journal mode, if an inode has recently been deleted, we want + * to avoid reusing it until we're reasonably sure the inode table + * block has been written back to disk. (Yes, these values are + * somewhat arbitrary...) + */ +#define RECENTCY_MIN 60 +#define RECENTCY_DIRTY 300 + +static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) +{ + struct ext4_group_desc *gdp; + struct ext4_inode *raw_inode; + struct buffer_head *bh; + int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + int offset, ret = 0; + int recentcy = RECENTCY_MIN; + u32 dtime, now; + + gdp = ext4_get_group_desc(sb, group, NULL); + if (unlikely(!gdp)) + return 0; + + bh = sb_find_get_block(sb, ext4_inode_table(sb, gdp) + + (ino / inodes_per_block)); + if (!bh || !buffer_uptodate(bh)) + /* + * If the block is not in the buffer cache, then it + * must have been written out, or, most unlikely, is + * being migrated - false failure should be OK here. + */ + goto out; + + offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb); + raw_inode = (struct ext4_inode *) (bh->b_data + offset); + + /* i_dtime is only 32 bits on disk, but we only care about relative + * times in the range of a few minutes (i.e. long enough to sync a + * recently-deleted inode to disk), so using the low 32 bits of the + * clock (a 68 year range) is enough, see time_before32() */ + dtime = le32_to_cpu(raw_inode->i_dtime); + now = ktime_get_real_seconds(); + if (buffer_dirty(bh)) + recentcy += RECENTCY_DIRTY; + + if (dtime && time_before32(dtime, now) && + time_before32(now, dtime + recentcy)) + ret = 1; +out: + brelse(bh); + return ret; +} + +static int find_inode_bit(struct super_block *sb, ext4_group_t group, + struct buffer_head *bitmap, unsigned long *ino) +{ + bool check_recently_deleted = EXT4_SB(sb)->s_journal == NULL; + unsigned long recently_deleted_ino = EXT4_INODES_PER_GROUP(sb); + +next: + *ino = ext4_find_next_zero_bit((unsigned long *) + bitmap->b_data, + EXT4_INODES_PER_GROUP(sb), *ino); + if (*ino >= EXT4_INODES_PER_GROUP(sb)) + goto not_found; + + if (check_recently_deleted && recently_deleted(sb, group, *ino)) { + recently_deleted_ino = *ino; + *ino = *ino + 1; + if (*ino < EXT4_INODES_PER_GROUP(sb)) + goto next; + goto not_found; + } + return 1; +not_found: + if (recently_deleted_ino >= EXT4_INODES_PER_GROUP(sb)) + return 0; + /* + * Not reusing recently deleted inodes is mostly a preference. We don't + * want to report ENOSPC or skew allocation patterns because of that. + * So return even recently deleted inode if we could find better in the + * given range. + */ + *ino = recently_deleted_ino; + return 1; +} + +int ext4_mark_inode_used(struct super_block *sb, int ino) +{ + unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); + struct buffer_head *inode_bitmap_bh = NULL, *group_desc_bh = NULL; + struct ext4_group_desc *gdp; + ext4_group_t group; + int bit; + int err; + + if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) + return -EFSCORRUPTED; + + group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); + inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); + if (IS_ERR(inode_bitmap_bh)) + return PTR_ERR(inode_bitmap_bh); + + if (ext4_test_bit(bit, inode_bitmap_bh->b_data)) { + err = 0; + goto out; + } + + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); + if (!gdp) { + err = -EINVAL; + goto out; + } + + ext4_set_bit(bit, inode_bitmap_bh->b_data); + + BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(NULL, NULL, inode_bitmap_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + err = sync_dirty_buffer(inode_bitmap_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + + /* We may have to initialize the block bitmap if it isn't already */ + if (ext4_has_group_desc_csum(sb) && + gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + struct buffer_head *block_bitmap_bh; + + block_bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(block_bitmap_bh)) { + err = PTR_ERR(block_bitmap_bh); + goto out; + } + + BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); + err = ext4_handle_dirty_metadata(NULL, NULL, block_bitmap_bh); + sync_dirty_buffer(block_bitmap_bh); + + /* recheck and clear flag under lock if we still need to */ + ext4_lock_group(sb, group); + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, group, gdp)); + ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh); + ext4_group_desc_csum_set(sb, group, gdp); + } + ext4_unlock_group(sb, group); + brelse(block_bitmap_bh); + + if (err) { + ext4_std_error(sb, err); + goto out; + } + } + + /* Update the relevant bg descriptor fields */ + if (ext4_has_group_desc_csum(sb)) { + int free; + + ext4_lock_group(sb, group); /* while we modify the bg desc */ + free = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); + free = 0; + } + + /* + * Check the relative inode number against the last used + * relative inode number in this group. if it is greater + * we need to update the bg_itable_unused count + */ + if (bit >= free) + ext4_itable_unused_set(sb, gdp, + (EXT4_INODES_PER_GROUP(sb) - bit - 1)); + } else { + ext4_lock_group(sb, group); + } + + ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); + if (ext4_has_group_desc_csum(sb)) { + ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh); + ext4_group_desc_csum_set(sb, group, gdp); + } + + ext4_unlock_group(sb, group); + err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh); + sync_dirty_buffer(group_desc_bh); +out: + brelse(inode_bitmap_bh); + return err; +} + +static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode, + bool encrypt) +{ + struct super_block *sb = dir->i_sb; + int nblocks = 0; +#ifdef CONFIG_EXT4_FS_POSIX_ACL + struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT); + + if (IS_ERR(p)) + return PTR_ERR(p); + if (p) { + int acl_size = p->a_count * sizeof(ext4_acl_entry); + + nblocks += (S_ISDIR(mode) ? 2 : 1) * + __ext4_xattr_set_credits(sb, NULL /* inode */, + NULL /* block_bh */, acl_size, + true /* is_create */); + posix_acl_release(p); + } +#endif + +#ifdef CONFIG_SECURITY + { + int num_security_xattrs = 1; + +#ifdef CONFIG_INTEGRITY + num_security_xattrs++; +#endif + /* + * We assume that security xattrs are never more than 1k. + * In practice they are under 128 bytes. + */ + nblocks += num_security_xattrs * + __ext4_xattr_set_credits(sb, NULL /* inode */, + NULL /* block_bh */, 1024, + true /* is_create */); + } +#endif + if (encrypt) + nblocks += __ext4_xattr_set_credits(sb, + NULL /* inode */, + NULL /* block_bh */, + FSCRYPT_SET_CONTEXT_MAX_SIZE, + true /* is_create */); + return nblocks; +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory's block + * group to find a free inode. + */ +struct inode *__ext4_new_inode(struct mnt_idmap *idmap, + handle_t *handle, struct inode *dir, + umode_t mode, const struct qstr *qstr, + __u32 goal, uid_t *owner, __u32 i_flags, + int handle_type, unsigned int line_no, + int nblocks) +{ + struct super_block *sb; + struct buffer_head *inode_bitmap_bh = NULL; + struct buffer_head *group_desc_bh; + ext4_group_t ngroups, group = 0; + unsigned long ino = 0; + struct inode *inode; + struct ext4_group_desc *gdp = NULL; + struct ext4_inode_info *ei; + struct ext4_sb_info *sbi; + int ret2, err; + struct inode *ret; + ext4_group_t i; + ext4_group_t flex_group; + struct ext4_group_info *grp = NULL; + bool encrypt = false; + + /* Cannot create files in a deleted directory */ + if (!dir || !dir->i_nlink) + return ERR_PTR(-EPERM); + + sb = dir->i_sb; + sbi = EXT4_SB(sb); + + ret2 = ext4_emergency_state(sb); + if (unlikely(ret2)) + return ERR_PTR(ret2); + + ngroups = ext4_get_groups_count(sb); + trace_ext4_request_inode(dir, mode); + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + ei = EXT4_I(inode); + + /* + * Initialize owners and quota early so that we don't have to account + * for quota initialization worst case in standard inode creating + * transaction + */ + if (owner) { + inode->i_mode = mode; + i_uid_write(inode, owner[0]); + i_gid_write(inode, owner[1]); + } else if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode_fsuid_set(inode, idmap); + inode->i_gid = dir->i_gid; + } else + inode_init_owner(idmap, inode, dir, mode); + + if (ext4_has_feature_project(sb) && + ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) + ei->i_projid = EXT4_I(dir)->i_projid; + else + ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID); + + if (!(i_flags & EXT4_EA_INODE_FL)) { + err = fscrypt_prepare_new_inode(dir, inode, &encrypt); + if (err) + goto out; + } + + err = dquot_initialize(inode); + if (err) + goto out; + + if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) { + ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt); + if (ret2 < 0) { + err = ret2; + goto out; + } + nblocks += ret2; + } + + if (!goal) + goal = sbi->s_inode_goal; + + if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) { + group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); + ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); + ret2 = 0; + goto got_group; + } + + if (S_ISDIR(mode)) + ret2 = find_group_orlov(sb, dir, &group, mode, qstr); + else + ret2 = find_group_other(sb, dir, &group, mode); + +got_group: + EXT4_I(dir)->i_last_alloc_group = group; + err = -ENOSPC; + if (ret2 == -1) + goto out; + + /* + * Normally we will only go through one pass of this loop, + * unless we get unlucky and it turns out the group we selected + * had its last inode grabbed by someone else. + */ + for (i = 0; i < ngroups; i++, ino = 0) { + err = -EIO; + + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); + if (!gdp) + goto out; + + /* + * Check free inodes count before loading bitmap. + */ + if (ext4_free_inodes_count(sb, gdp) == 0) + goto next_group; + + if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { + grp = ext4_get_group_info(sb, group); + /* + * Skip groups with already-known suspicious inode + * tables + */ + if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + goto next_group; + } + + brelse(inode_bitmap_bh); + inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); + /* Skip groups with suspicious inode tables */ + if (IS_ERR(inode_bitmap_bh)) { + inode_bitmap_bh = NULL; + goto next_group; + } + if (!(sbi->s_mount_state & EXT4_FC_REPLAY) && + EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + goto next_group; + + ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); + if (!ret2) + goto next_group; + + if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) { + ext4_error(sb, "reserved inode found cleared - " + "inode=%lu", ino + 1); + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + goto next_group; + } + + if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) { + BUG_ON(nblocks <= 0); + handle = __ext4_journal_start_sb(NULL, dir->i_sb, + line_no, handle_type, nblocks, 0, + ext4_trans_default_revoke_credits(sb)); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext4_std_error(sb, err); + goto out; + } + } + BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, inode_bitmap_bh, + EXT4_JTR_NONE); + if (err) { + ext4_std_error(sb, err); + goto out; + } + ext4_lock_group(sb, group); + ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); + if (ret2) { + /* Someone already took the bit. Repeat the search + * with lock held. + */ + ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); + if (ret2) { + ext4_set_bit(ino, inode_bitmap_bh->b_data); + ret2 = 0; + } else { + ret2 = 1; /* we didn't grab the inode */ + } + } + ext4_unlock_group(sb, group); + ino++; /* the inode bitmap is zero-based */ + if (!ret2) + goto got; /* we grabbed the inode! */ + +next_group: + if (++group == ngroups) + group = 0; + } + err = -ENOSPC; + goto out; + +got: + BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, group_desc_bh, + EXT4_JTR_NONE); + if (err) { + ext4_std_error(sb, err); + goto out; + } + + /* We may have to initialize the block bitmap if it isn't already */ + if (ext4_has_group_desc_csum(sb) && + gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + struct buffer_head *block_bitmap_bh; + + block_bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(block_bitmap_bh)) { + err = PTR_ERR(block_bitmap_bh); + goto out; + } + BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); + err = ext4_journal_get_write_access(handle, sb, block_bitmap_bh, + EXT4_JTR_NONE); + if (err) { + brelse(block_bitmap_bh); + ext4_std_error(sb, err); + goto out; + } + + BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); + err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); + + /* recheck and clear flag under lock if we still need to */ + ext4_lock_group(sb, group); + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, group, gdp)); + ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh); + ext4_group_desc_csum_set(sb, group, gdp); + } + ext4_unlock_group(sb, group); + brelse(block_bitmap_bh); + + if (err) { + ext4_std_error(sb, err); + goto out; + } + } + + /* Update the relevant bg descriptor fields */ + if (ext4_has_group_desc_csum(sb)) { + int free; + struct ext4_group_info *grp = NULL; + + if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { + grp = ext4_get_group_info(sb, group); + if (!grp) { + err = -EFSCORRUPTED; + goto out; + } + down_read(&grp->alloc_sem); /* + * protect vs itable + * lazyinit + */ + } + ext4_lock_group(sb, group); /* while we modify the bg desc */ + free = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); + free = 0; + } + /* + * Check the relative inode number against the last used + * relative inode number in this group. if it is greater + * we need to update the bg_itable_unused count + */ + if (ino > free) + ext4_itable_unused_set(sb, gdp, + (EXT4_INODES_PER_GROUP(sb) - ino)); + if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) + up_read(&grp->alloc_sem); + } else { + ext4_lock_group(sb, group); + } + + ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); + if (S_ISDIR(mode)) { + ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, group); + + atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups, + f)->used_dirs); + } + } + if (ext4_has_group_desc_csum(sb)) { + ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh); + ext4_group_desc_csum_set(sb, group, gdp); + } + ext4_unlock_group(sb, group); + + BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + + percpu_counter_dec(&sbi->s_freeinodes_counter); + if (S_ISDIR(mode)) + percpu_counter_inc(&sbi->s_dirs_counter); + + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, group); + atomic_dec(&sbi_array_rcu_deref(sbi, s_flex_groups, + flex_group)->free_inodes); + } + + inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); + /* This is the optimal IO size (for stat), not the fs block size */ + inode->i_blocks = 0; + simple_inode_init_ts(inode); + ei->i_crtime = inode_get_mtime(inode); + + memset(ei->i_data, 0, sizeof(ei->i_data)); + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + + /* Don't inherit extent flag from directory, amongst others. */ + ei->i_flags = + ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); + ei->i_flags |= i_flags; + ei->i_file_acl = 0; + ei->i_dtime = 0; + ei->i_block_group = group; + ei->i_last_alloc_group = ~0; + + ext4_set_inode_flags(inode, true); + if (IS_DIRSYNC(inode)) + ext4_handle_sync(handle); + if (insert_inode_locked(inode) < 0) { + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. + */ + err = -EIO; + ext4_error(sb, "failed to insert inode %lu: doubly allocated?", + inode->i_ino); + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + goto out; + } + inode->i_generation = get_random_u32(); + + /* Precompute checksum seed for inode metadata */ + if (ext4_has_feature_metadata_csum(sb)) { + __u32 csum; + __le32 inum = cpu_to_le32(inode->i_ino); + __le32 gen = cpu_to_le32(inode->i_generation); + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, + sizeof(inum)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); + } + + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ + ext4_set_inode_state(inode, EXT4_STATE_NEW); + + ei->i_extra_isize = sbi->s_want_extra_isize; + ei->i_inline_off = 0; + if (ext4_has_feature_inline_data(sb) && + (!(ei->i_flags & (EXT4_DAX_FL|EXT4_EA_INODE_FL)) || S_ISDIR(mode))) + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + ret = inode; + err = dquot_alloc_inode(inode); + if (err) + goto fail_drop; + + /* + * Since the encryption xattr will always be unique, create it first so + * that it's less likely to end up in an external xattr block and + * prevent its deduplication. + */ + if (encrypt) { + err = fscrypt_set_context(inode, handle); + if (err) + goto fail_free_drop; + } + + if (!(ei->i_flags & EXT4_EA_INODE_FL)) { + err = ext4_init_acl(handle, inode, dir); + if (err) + goto fail_free_drop; + + err = ext4_init_security(handle, inode, dir, qstr); + if (err) + goto fail_free_drop; + } + + if (ext4_has_feature_extents(sb)) { + /* set extent flag only for directory, file and normal symlink*/ + if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode); + } + } + + ext4_set_inode_mapping_order(inode); + + ext4_update_inode_fsync_trans(handle, inode, 1); + + err = ext4_mark_inode_dirty(handle, inode); + if (err) { + ext4_std_error(sb, err); + goto fail_free_drop; + } + + ext4_debug("allocating inode %lu\n", inode->i_ino); + trace_ext4_allocate_inode(inode, dir, mode); + brelse(inode_bitmap_bh); + return ret; + +fail_free_drop: + dquot_free_inode(inode); +fail_drop: + clear_nlink(inode); + unlock_new_inode(inode); +out: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + iput(inode); + brelse(inode_bitmap_bh); + return ERR_PTR(err); +} + +/* Verify that we are loading a valid orphan from disk */ +struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) +{ + unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); + ext4_group_t block_group; + int bit; + struct buffer_head *bitmap_bh = NULL; + struct inode *inode = NULL; + int err = -EFSCORRUPTED; + + if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) + goto bad_orphan; + + block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); + if (IS_ERR(bitmap_bh)) + return ERR_CAST(bitmap_bh); + + /* Having the inode bit set should be a 100% indicator that this + * is a valid orphan (no e2fsck run on fs). Orphans also include + * inodes that were being truncated, so we can't check i_nlink==0. + */ + if (!ext4_test_bit(bit, bitmap_bh->b_data)) + goto bad_orphan; + + inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + ext4_error_err(sb, -err, + "couldn't read orphan inode %lu (err %d)", + ino, err); + brelse(bitmap_bh); + return inode; + } + + /* + * If the orphans has i_nlinks > 0 then it should be able to + * be truncated, otherwise it won't be removed from the orphan + * list during processing and an infinite loop will result. + * Similarly, it must not be a bad inode. + */ + if ((inode->i_nlink && !ext4_can_truncate(inode)) || + is_bad_inode(inode)) + goto bad_orphan; + + if (NEXT_ORPHAN(inode) > max_ino) + goto bad_orphan; + brelse(bitmap_bh); + return inode; + +bad_orphan: + ext4_error(sb, "bad orphan inode %lu", ino); + if (bitmap_bh) + printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n", + bit, (unsigned long long)bitmap_bh->b_blocknr, + ext4_test_bit(bit, bitmap_bh->b_data)); + if (inode) { + printk(KERN_ERR "is_bad_inode(inode)=%d\n", + is_bad_inode(inode)); + printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n", + NEXT_ORPHAN(inode)); + printk(KERN_ERR "max_ino=%lu\n", max_ino); + printk(KERN_ERR "i_nlink=%u\n", inode->i_nlink); + /* Avoid freeing blocks if we got a bad deleted inode */ + if (inode->i_nlink == 0) + inode->i_blocks = 0; + iput(inode); + } + brelse(bitmap_bh); + return ERR_PTR(err); +} + +unsigned long ext4_count_free_inodes(struct super_block *sb) +{ + unsigned long desc_count; + struct ext4_group_desc *gdp; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); +#ifdef EXT4FS_DEBUG + struct ext4_super_block *es; + unsigned long bitmap_count, x; + struct buffer_head *bitmap_bh = NULL; + + es = EXT4_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += ext4_free_inodes_count(sb, gdp); + brelse(bitmap_bh); + bitmap_bh = ext4_read_inode_bitmap(sb, i); + if (IS_ERR(bitmap_bh)) { + bitmap_bh = NULL; + continue; + } + + x = ext4_count_free(bitmap_bh->b_data, + EXT4_INODES_PER_GROUP(sb) / 8); + printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", + (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk(KERN_DEBUG "ext4_count_free_inodes: " + "stored = %u, computed = %lu, %lu\n", + le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); + return desc_count; +#else + desc_count = 0; + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += ext4_free_inodes_count(sb, gdp); + cond_resched(); + } + return desc_count; +#endif +} + +/* Called at mount-time, super-block is locked */ +unsigned long ext4_count_dirs(struct super_block * sb) +{ + unsigned long count = 0; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + + for (i = 0; i < ngroups; i++) { + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + count += ext4_used_dirs_count(sb, gdp); + } + return count; +} + +/* + * Zeroes not yet zeroed inode table - just write zeroes through the whole + * inode table. Must be called without any spinlock held. The only place + * where it is called from on active part of filesystem is ext4lazyinit + * thread, so we do not need any special locks, however we have to prevent + * inode allocation from the current group, so we take alloc_sem lock, to + * block ext4_new_inode() until we are finished. + */ +int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, + int barrier) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + struct buffer_head *group_desc_bh; + handle_t *handle; + ext4_fsblk_t blk; + int num, ret = 0, used_blks = 0; + unsigned long used_inos = 0; + + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); + if (!gdp || !grp) + goto out; + + /* + * We do not need to lock this, because we are the only one + * handling this flag. + */ + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) + goto out; + + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + down_write(&grp->alloc_sem); + /* + * If inode bitmap was already initialized there may be some + * used inodes so we need to skip blocks with used inodes in + * inode table. + */ + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) { + used_inos = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + used_blks = DIV_ROUND_UP(used_inos, sbi->s_inodes_per_block); + + /* Bogus inode unused count? */ + if (used_blks < 0 || used_blks > sbi->s_itb_per_group) { + ext4_error(sb, "Something is wrong with group %u: " + "used itable blocks: %d; " + "itable unused count: %u", + group, used_blks, + ext4_itable_unused_count(sb, gdp)); + ret = 1; + goto err_out; + } + + used_inos += group * EXT4_INODES_PER_GROUP(sb); + /* + * Are there some uninitialized inodes in the inode table + * before the first normal inode? + */ + if ((used_blks != sbi->s_itb_per_group) && + (used_inos < EXT4_FIRST_INO(sb))) { + ext4_error(sb, "Something is wrong with group %u: " + "itable unused count: %u; " + "itables initialized count: %ld", + group, ext4_itable_unused_count(sb, gdp), + used_inos); + ret = 1; + goto err_out; + } + } + + blk = ext4_inode_table(sb, gdp) + used_blks; + num = sbi->s_itb_per_group - used_blks; + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + ret = ext4_journal_get_write_access(handle, sb, group_desc_bh, + EXT4_JTR_NONE); + if (ret) + goto err_out; + + /* + * Skip zeroout if the inode table is full. But we set the ZEROED + * flag anyway, because obviously, when it is full it does not need + * further zeroing. + */ + if (unlikely(num == 0)) + goto skip_zeroout; + + ext4_debug("going to zero out inode table in group %d\n", + group); + ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS); + if (ret < 0) + goto err_out; + if (barrier) + blkdev_issue_flush(sb->s_bdev); + +skip_zeroout: + ext4_lock_group(sb, group); + gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); + ext4_group_desc_csum_set(sb, group, gdp); + ext4_unlock_group(sb, group); + + BUFFER_TRACE(group_desc_bh, + "call ext4_handle_dirty_metadata"); + ret = ext4_handle_dirty_metadata(handle, NULL, + group_desc_bh); + +err_out: + up_write(&grp->alloc_sem); + ext4_journal_stop(handle); +out: + return ret; +} -- 2.43.0