[PATCH 00/17] ext4l: Begin an implementation of ext4 based on Linux
From: Simon Glass <simon.glass@canonical.com>

U-Boot has long had a solid implementation of ext4. Support for
writing and basic journaling was added as well. However, there are
some missing features. For example, some distros enable metadata
checksums, which are not supported. There are also other features,
such as fast commit and a special feature for small files, which are
not currently available.

Linux has the canonical implementation of ext4, so one approach would
be to plumb that code into U-Boot. It is not a small undertaking,
since the existing linux/compat.h has only a small fraction of the
required features.

As a start towards this effort, bring in the entire Linux code,
unmodified.

Note: There are thousands of checkpatch checks and warnings, plus over
100 errors. These have been left as-is to reduce the source delta.

Simon Glass (17):
  ext4l: bring in extents.c
  ext4l: bring in inode.c
  ext4l: bring in mballoc.c
  ext4l: bring in namei.c
  ext4l: bring in super.c
  ext4l: bring in ext4.h
  ext4l: bring in files a-d
  ext4l: bring in extents_status, fsync, and hash
  ext4l: bring in ext4_jbd2 and fast_commit
  ext4l: bring in file, fsmap, and ialloc
  ext4l: bring in indirect and inline
  ext4l: bring in inode-test, ioctl, and mballoc headers
  ext4l: bring in migrate, mmp, move_extent, orphan, and page-io
  ext4l: bring in resize, symlink, sysfs, truncate, verity, and readpage
  ext4l: bring in xattr files
  ext4l: bring in Kconfig and Makefile
  ext4l: bring in jbd2 header file

 fs/ext4l/Kconfig          |   92 +
 fs/ext4l/Makefile         |   20 +
 fs/ext4l/acl.c            |  304 ++
 fs/ext4l/acl.h            |   74 +
 fs/ext4l/balloc.c         | 1003 +++++
 fs/ext4l/bitmap.c         |   99 +
 fs/ext4l/block_validity.c |  370 ++
 fs/ext4l/crypto.c         |  241 ++
 fs/ext4l/dir.c            |  693 ++++
 fs/ext4l/ext4.h           | 3932 +++++++++++++++++++
 fs/ext4l/ext4_extents.h   |  268 ++
 fs/ext4l/ext4_jbd2.c      |  408 ++
 fs/ext4l/ext4_jbd2.h      |  461 +++
 fs/ext4l/extents.c        | 6189 ++++++++++++++++++++++++++++++
 fs/ext4l/extents_status.c | 2306 ++++++++++++
 fs/ext4l/extents_status.h |  254 ++
 fs/ext4l/fast_commit.c    | 2343 ++++++++++++
 fs/ext4l/fast_commit.h    |  186 +
 fs/ext4l/file.c           |  995 +++++
 fs/ext4l/fsmap.c          |  792 ++++
 fs/ext4l/fsmap.h          |   56 +
 fs/ext4l/fsync.c          |  177 +
 fs/ext4l/hash.c           |  323 ++
 fs/ext4l/ialloc.c         | 1621 ++++++++
 fs/ext4l/indirect.c       | 1474 ++++++++
 fs/ext4l/inline.c         | 1982 ++++++++++
 fs/ext4l/inode-test.c     |  283 ++
 fs/ext4l/inode.c          | 6756 +++++++++++++++++++++++++++++++++
 fs/ext4l/ioctl.c          | 2020 ++++++++++
 fs/ext4l/mballoc-test.c   |  999 +++++
 fs/ext4l/mballoc.c        | 7175 +++++++++++++++++++++++++++++++++++
 fs/ext4l/mballoc.h        |  273 ++
 fs/ext4l/migrate.c        |  672 ++++
 fs/ext4l/mmp.c            |  408 ++
 fs/ext4l/move_extent.c    |  706 ++++
 fs/ext4l/namei.c          | 4241 +++++++++++++++++++++
 fs/ext4l/orphan.c         |  659 ++++
 fs/ext4l/page-io.c        |  593 +++
 fs/ext4l/readpage.c       |  422 +++
 fs/ext4l/resize.c         | 2193 +++++++++++
 fs/ext4l/super.c          | 7516 +++++++++++++++++++++++++++++++++++++
 fs/ext4l/symlink.c        |  136 +
 fs/ext4l/sysfs.c          |  648 ++++
 fs/ext4l/truncate.h       |   52 +
 fs/ext4l/verity.c         |  399 ++
 fs/ext4l/xattr.c          | 3225 ++++++++++++++++
 fs/ext4l/xattr.h          |  236 ++
 fs/ext4l/xattr_hurd.c     |   52 +
 fs/ext4l/xattr_security.c |   66 +
 fs/ext4l/xattr_trusted.c  |   47 +
 fs/ext4l/xattr_user.c     |   50 +
 include/linux/jbd2.h      | 1815 +++++++++
 52 files changed, 68305 insertions(+)
 create mode 100644 fs/ext4l/Kconfig
 create mode 100644 fs/ext4l/Makefile
 create mode 100644 fs/ext4l/acl.c
 create mode 100644 fs/ext4l/acl.h
 create mode 100644 fs/ext4l/balloc.c
 create mode 100644 fs/ext4l/bitmap.c
 create mode 100644 fs/ext4l/block_validity.c
 create mode 100644 fs/ext4l/crypto.c
 create mode 100644 fs/ext4l/dir.c
 create mode 100644 fs/ext4l/ext4.h
 create mode 100644 fs/ext4l/ext4_extents.h
 create mode 100644 fs/ext4l/ext4_jbd2.c
 create mode 100644 fs/ext4l/ext4_jbd2.h
 create mode 100644 fs/ext4l/extents.c
 create mode 100644 fs/ext4l/extents_status.c
 create mode 100644 fs/ext4l/extents_status.h
 create mode 100644 fs/ext4l/fast_commit.c
 create mode 100644 fs/ext4l/fast_commit.h
 create mode 100644 fs/ext4l/file.c
 create mode 100644 fs/ext4l/fsmap.c
 create mode 100644 fs/ext4l/fsmap.h
 create mode 100644 fs/ext4l/fsync.c
 create mode 100644 fs/ext4l/hash.c
 create mode 100644 fs/ext4l/ialloc.c
 create mode 100644 fs/ext4l/indirect.c
 create mode 100644 fs/ext4l/inline.c
 create mode 100644 fs/ext4l/inode-test.c
 create mode 100644 fs/ext4l/inode.c
 create mode 100644 fs/ext4l/ioctl.c
 create mode 100644 fs/ext4l/mballoc-test.c
 create mode 100644 fs/ext4l/mballoc.c
 create mode 100644 fs/ext4l/mballoc.h
 create mode 100644 fs/ext4l/migrate.c
 create mode 100644 fs/ext4l/mmp.c
 create mode 100644 fs/ext4l/move_extent.c
 create mode 100644 fs/ext4l/namei.c
 create mode 100644 fs/ext4l/orphan.c
 create mode 100644 fs/ext4l/page-io.c
 create mode 100644 fs/ext4l/readpage.c
 create mode 100644 fs/ext4l/resize.c
 create mode 100644 fs/ext4l/super.c
 create mode 100644 fs/ext4l/symlink.c
 create mode 100644 fs/ext4l/sysfs.c
 create mode 100644 fs/ext4l/truncate.h
 create mode 100644 fs/ext4l/verity.c
 create mode 100644 fs/ext4l/xattr.c
 create mode 100644 fs/ext4l/xattr.h
 create mode 100644 fs/ext4l/xattr_hurd.c
 create mode 100644 fs/ext4l/xattr_security.c
 create mode 100644 fs/ext4l/xattr_trusted.c
 create mode 100644 fs/ext4l/xattr_user.c
 create mode 100644 include/linux/jbd2.h

--
2.43.0

base-commit: 7764e43798d244dc088e0a6eb2a19bc9c82c1764
branch: exta
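For a sense of the linux/compat.h plumbing this effort implies, below
is a minimal sketch, not part of this series, of the kind of shim the
compat layer would need to grow so the imported code can read metadata
blocks through U-Boot's block layer. It assumes (for brevity) that the
device block size equals the filesystem block size, and
ext4l_blk_desc() is a hypothetical accessor for the underlying struct
blk_desc. In Linux, sb_getblk() only maps a buffer and the actual read
happens later (for example via ext4_read_bh()); this sketch folds the
two steps together to keep it short:

#include <blk.h>
#include <malloc.h>

struct buffer_head {
	lbaint_t b_blocknr;	/* block number within the device */
	size_t b_size;		/* size of the block in bytes */
	char *b_data;		/* the block's contents */
	int b_count;		/* naive reference count */
};

/* Read one filesystem block via U-Boot's block layer */
static struct buffer_head *sb_getblk(struct super_block *sb,
				     lbaint_t block)
{
	struct blk_desc *desc = ext4l_blk_desc(sb);	/* hypothetical */
	struct buffer_head *bh;

	bh = calloc(1, sizeof(*bh));
	if (!bh)
		return NULL;
	bh->b_blocknr = block;
	bh->b_size = sb->s_blocksize;
	bh->b_data = malloc(bh->b_size);
	if (!bh->b_data || blk_dread(desc, block, 1, bh->b_data) != 1) {
		free(bh->b_data);
		free(bh);
		return NULL;
	}
	bh->b_count = 1;
	return bh;
}

/* Drop a reference; free the buffer when the last one goes away */
static void brelse(struct buffer_head *bh)
{
	if (bh && !--bh->b_count) {
		free(bh->b_data);
		free(bh);
	}
}

A real shim would instead keep a buffer cache keyed by block number,
as Linux does, rather than allocating and reading on every call.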
From: Simon Glass <simon.glass@canonical.com>

Copy Kconfig and Makefile from Linux v6.18 fs/ext4 directory.

- Kconfig: configuration options for ext4
- Makefile: build rules

Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com>
---
 fs/ext4l/Kconfig  | 92 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4l/Makefile | 20 +++++++++++
 2 files changed, 112 insertions(+)
 create mode 100644 fs/ext4l/Kconfig
 create mode 100644 fs/ext4l/Makefile

diff --git a/fs/ext4l/Kconfig b/fs/ext4l/Kconfig
new file mode 100644
index 00000000000..01873c2a34a
--- /dev/null
+++ b/fs/ext4l/Kconfig
@@ -0,0 +1,92 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config EXT4_FS
+	tristate "The Extended 4 (ext4) filesystem"
+	select BUFFER_HEAD
+	select JBD2
+	select CRC16
+	select CRC32
+	select FS_IOMAP
+	select FS_ENCRYPTION_ALGS if FS_ENCRYPTION
+	help
+	  This is the next generation of the ext3 filesystem.
+
+	  Unlike the change from ext2 filesystem to ext3 filesystem,
+	  the on-disk format of ext4 is not forwards compatible with
+	  ext3; it is based on extent maps and it supports 48-bit
+	  physical block numbers. The ext4 filesystem also supports delayed
+	  allocation, persistent preallocation, high resolution time stamps,
+	  and a number of other features to improve performance and speed
+	  up fsck time. For more information, please see the web pages at
+	  http://ext4.wiki.kernel.org.
+
+	  The ext4 filesystem supports mounting an ext3 filesystem; while there
+	  are some performance gains from the delayed allocation and inode
+	  table readahead, the best performance gains require enabling ext4
+	  features in the filesystem using tune2fs, or formatting a new
+	  filesystem as an ext4 filesystem initially. Without explicit enabling
+	  of ext4 features, the on disk filesystem format stays fully backward
+	  compatible.
+
+	  To compile this file system support as a module, choose M here. The
+	  module will be called ext4.
+
+	  If unsure, say N.
+
+config EXT4_USE_FOR_EXT2
+	bool "Use ext4 for ext2 file systems"
+	depends on EXT4_FS
+	depends on EXT2_FS=n
+	default y
+	help
+	  Allow the ext4 file system driver code to be used for ext2
+	  file system mounts.  This allows users to reduce their
+	  compiled kernel size by using one file system driver for
+	  ext2, ext3, and ext4 file systems.
+
+config EXT4_FS_POSIX_ACL
+	bool "Ext4 POSIX Access Control Lists"
+	depends on EXT4_FS
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  If you don't know what Access Control Lists are, say N
+
+config EXT4_FS_SECURITY
+	bool "Ext4 Security Labels"
+	depends on EXT4_FS
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the ext4 filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
+
+config EXT4_DEBUG
+	bool "Ext4 debugging support"
+	depends on EXT4_FS
+	help
+	  Enables run-time debugging support for the ext4 filesystem.
+
+	  If you select Y here, then you will be able to turn on debugging
+	  using dynamic debug control for mb_debug() / ext_debug() msgs.
+
+config EXT4_KUNIT_TESTS
+	tristate "KUnit tests for ext4" if !KUNIT_ALL_TESTS
+	depends on EXT4_FS && KUNIT
+	default KUNIT_ALL_TESTS
+	help
+	  This builds the ext4 KUnit tests.
+
+	  KUnit tests run during boot and output the results to the debug log
+	  in TAP format (https://testanything.org/). Only useful for kernel devs
+	  running KUnit test harness and are not for inclusion into a production
+	  build.
+
+	  For more information on KUnit and unit tests in general please refer
+	  to the KUnit documentation in Documentation/dev-tools/kunit/.
+
+	  If unsure, say N.
diff --git a/fs/ext4l/Makefile b/fs/ext4l/Makefile
new file mode 100644
index 00000000000..72206a29267
--- /dev/null
+++ b/fs/ext4l/Makefile
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the linux ext4-filesystem routines.
+#
+
+obj-$(CONFIG_EXT4_FS) += ext4.o
+
+ext4-y	:= balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
+		extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \
+		indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \
+		mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \
+		super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \
+		xattr_user.o fast_commit.o orphan.o
+
+ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
+ext4-$(CONFIG_EXT4_FS_SECURITY)		+= xattr_security.o
+ext4-inode-test-objs			+= inode-test.o
+obj-$(CONFIG_EXT4_KUNIT_TESTS)		+= ext4-inode-test.o
+ext4-$(CONFIG_FS_VERITY)		+= verity.o
+ext4-$(CONFIG_FS_ENCRYPTION)		+= crypto.o
--
2.43.0
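As an illustrative usage sketch only: assuming fs/ext4l/Kconfig is
eventually sourced from fs/Kconfig, which this series does not yet
wire up, a board defconfig fragment enabling the imported code might
look like:

CONFIG_EXT4_FS=y
CONFIG_EXT4_FS_POSIX_ACL=y
CONFIG_EXT4_FS_SECURITY=y

Note that the select BUFFER_HEAD, JBD2, CRC16, CRC32 and FS_IOMAP
lines mean those symbols must also exist on the U-Boot side before
this file can be used as-is.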
From: Simon Glass <simon.glass@canonical.com> Copy extents.c and ext4_extents.h from Linux v6.18 fs/ext4 directory. These files implement the extent-based block mapping used by ext4. Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> --- fs/ext4l/ext4_extents.h | 268 ++ fs/ext4l/extents.c | 6189 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 6457 insertions(+) create mode 100644 fs/ext4l/ext4_extents.h create mode 100644 fs/ext4l/extents.c diff --git a/fs/ext4l/ext4_extents.h b/fs/ext4l/ext4_extents.h new file mode 100644 index 00000000000..c484125d963 --- /dev/null +++ b/fs/ext4l/ext4_extents.h @@ -0,0 +1,268 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas <alex@clusterfs.com> + */ + +#ifndef _EXT4_EXTENTS +#define _EXT4_EXTENTS + +#include "ext4.h" + +/* + * With AGGRESSIVE_TEST defined, the capacity of index/leaf blocks + * becomes very small, so index split, in-depth growing and + * other hard changes happen much more often. + * This is for debug purposes only. + */ +#define AGGRESSIVE_TEST_ + +/* + * With EXTENTS_STATS defined, the number of blocks and extents + * are collected in the truncate path. They'll be shown at + * umount time. + */ +#define EXTENTS_STATS__ + +/* + * If CHECK_BINSEARCH is defined, then the results of the binary search + * will also be checked by linear search. + */ +#define CHECK_BINSEARCH__ + +/* + * ext4_inode has i_block array (60 bytes total). + * The first 12 bytes store ext4_extent_header; + * the remainder stores an array of ext4_extent. + * For non-inode extent blocks, ext4_extent_tail + * follows the array. + */ + +/* + * This is the extent tail on-disk structure. + * All other extent structures are 12 bytes long. It turns out that + * block_size % 12 >= 4 for at least all powers of 2 greater than 512, which + * covers all valid ext4 block sizes. Therefore, this tail structure can be + * crammed into the end of the block without having to rebalance the tree. + */ +struct ext4_extent_tail { + __le32 et_checksum; /* crc32c(uuid+inum+extent_block) */ +}; + +/* + * This is the extent on-disk structure. + * It's used at the bottom of the tree. + */ +struct ext4_extent { + __le32 ee_block; /* first logical block extent covers */ + __le16 ee_len; /* number of blocks covered by extent */ + __le16 ee_start_hi; /* high 16 bits of physical block */ + __le32 ee_start_lo; /* low 32 bits of physical block */ +}; + +/* + * This is index on-disk structure. + * It's used at all the levels except the bottom. + */ +struct ext4_extent_idx { + __le32 ei_block; /* index covers logical blocks from 'block' */ + __le32 ei_leaf_lo; /* pointer to the physical block of the next * + * level. leaf or next index could be there */ + __le16 ei_leaf_hi; /* high 16 bits of physical block */ + __u16 ei_unused; +}; + +/* + * Each block (leaves and indexes), even inode-stored has header. + */ +struct ext4_extent_header { + __le16 eh_magic; /* probably will support different formats */ + __le16 eh_entries; /* number of valid entries */ + __le16 eh_max; /* capacity of store in entries */ + __le16 eh_depth; /* has tree real underlying blocks? 
*/ + __le32 eh_generation; /* generation of the tree */ +}; + +#define EXT4_EXT_MAGIC cpu_to_le16(0xf30a) +#define EXT4_MAX_EXTENT_DEPTH 5 + +#define EXT4_EXTENT_TAIL_OFFSET(hdr) \ + (sizeof(struct ext4_extent_header) + \ + (sizeof(struct ext4_extent) * le16_to_cpu((hdr)->eh_max))) + +static inline struct ext4_extent_tail * +find_ext4_extent_tail(struct ext4_extent_header *eh) +{ + return (struct ext4_extent_tail *)(((void *)eh) + + EXT4_EXTENT_TAIL_OFFSET(eh)); +} + +/* + * Array of ext4_ext_path contains path to some extent. + * Creation/lookup routines use it for traversal/splitting/etc. + * Truncate uses it to simulate recursive walking. + */ +struct ext4_ext_path { + ext4_fsblk_t p_block; + __u16 p_depth; + __u16 p_maxdepth; + struct ext4_extent *p_ext; + struct ext4_extent_idx *p_idx; + struct ext4_extent_header *p_hdr; + struct buffer_head *p_bh; +}; + +/* + * Used to record a portion of a cluster found at the beginning or end + * of an extent while traversing the extent tree during space removal. + * A partial cluster may be removed if it does not contain blocks shared + * with extents that aren't being deleted (tofree state). Otherwise, + * it cannot be removed (nofree state). + */ +struct partial_cluster { + ext4_fsblk_t pclu; /* physical cluster number */ + ext4_lblk_t lblk; /* logical block number within logical cluster */ + enum {initial, tofree, nofree} state; +}; + +/* + * structure for external API + */ + +/* + * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an + * initialized extent. This is 2^15 and not (2^16 - 1), since we use the + * MSB of ee_len field in the extent datastructure to signify if this + * particular extent is an initialized extent or an unwritten (i.e. + * preallocated). + * EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an + * unwritten extent. + * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an + * unwritten one. In other words, if MSB of ee_len is set, it is an + * unwritten extent with only one special scenario when ee_len = 0x8000. + * In this case we can not have an unwritten extent of zero length and + * thus we make it as a special case of initialized extent with 0x8000 length. + * This way we get better extent-to-group alignment for initialized extents. + * Hence, the maximum number of blocks we can have in an *initialized* + * extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767). + */ +#define EXT_INIT_MAX_LEN (1UL << 15) +#define EXT_UNWRITTEN_MAX_LEN (EXT_INIT_MAX_LEN - 1) + + +#define EXT_FIRST_EXTENT(__hdr__) \ + ((struct ext4_extent *) (((char *) (__hdr__)) + \ + sizeof(struct ext4_extent_header))) +#define EXT_FIRST_INDEX(__hdr__) \ + ((struct ext4_extent_idx *) (((char *) (__hdr__)) + \ + sizeof(struct ext4_extent_header))) +#define EXT_HAS_FREE_INDEX(__path__) \ + (le16_to_cpu((__path__)->p_hdr->eh_entries) \ + < le16_to_cpu((__path__)->p_hdr->eh_max)) +#define EXT_LAST_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_LAST_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_MAX_EXTENT(__hdr__) \ + ((le16_to_cpu((__hdr__)->eh_max)) ? \ + ((EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \ + : NULL) +#define EXT_MAX_INDEX(__hdr__) \ + ((le16_to_cpu((__hdr__)->eh_max)) ? 
\ + ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \ + : NULL) + +static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode) +{ + return (struct ext4_extent_header *) EXT4_I(inode)->i_data; +} + +static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh) +{ + return (struct ext4_extent_header *) bh->b_data; +} + +static inline unsigned short ext_depth(struct inode *inode) +{ + return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); +} + +static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext) +{ + /* We can not have an unwritten extent of zero length! */ + BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0); + ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN); +} + +static inline int ext4_ext_is_unwritten(struct ext4_extent *ext) +{ + /* Extent with ee_len of 0x8000 is treated as an initialized extent */ + return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN); +} + +static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) +{ + return (le16_to_cpu(ext->ee_len) <= EXT_INIT_MAX_LEN ? + le16_to_cpu(ext->ee_len) : + (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); +} + +static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) +{ + ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); +} + +/* + * ext4_ext_pblock: + * combine low and high parts of physical block number into ext4_fsblk_t + */ +static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex) +{ + ext4_fsblk_t block; + + block = le32_to_cpu(ex->ee_start_lo); + block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1; + return block; +} + +/* + * ext4_idx_pblock: + * combine low and high parts of a leaf physical block number into ext4_fsblk_t + */ +static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix) +{ + ext4_fsblk_t block; + + block = le32_to_cpu(ix->ei_leaf_lo); + block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1; + return block; +} + +/* + * ext4_ext_store_pblock: + * stores a large physical block number into an extent struct, + * breaking it into parts + */ +static inline void ext4_ext_store_pblock(struct ext4_extent *ex, + ext4_fsblk_t pb) +{ + ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); + ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & + 0xffff); +} + +/* + * ext4_idx_store_pblock: + * stores a large physical block number into an index struct, + * breaking it into parts + */ +static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, + ext4_fsblk_t pb) +{ + ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); + ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & + 0xffff); +} + +#endif /* _EXT4_EXTENTS */ + diff --git a/fs/ext4l/extents.c b/fs/ext4l/extents.c new file mode 100644 index 00000000000..ca5499e9412 --- /dev/null +++ b/fs/ext4l/extents.c @@ -0,0 +1,6189 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas <alex@clusterfs.com> + * + * Architecture independence: + * Copyright (c) 2005, Bull S.A. 
+ * Written by Pierre Peiffer <pierre.peiffer@bull.net> + */ + +/* + * Extents support for EXT4 + * + * TODO: + * - ext4*_error() should be used in some situations + * - analyze all BUG()/BUG_ON(), use -EIO where appropriate + * - smart tree reduction + */ + +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/jbd2.h> +#include <linux/highuid.h> +#include <linux/pagemap.h> +#include <linux/quotaops.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/fiemap.h> +#include <linux/iomap.h> +#include <linux/sched/mm.h> +#include "ext4_jbd2.h" +#include "ext4_extents.h" +#include "xattr.h" + +#include <trace/events/ext4.h> + +/* + * used by extent splitting. + */ +#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ + due to ENOSPC */ +#define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */ +#define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */ + +#define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */ +#define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */ + +static __le32 ext4_extent_block_csum(struct inode *inode, + struct ext4_extent_header *eh) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + __u32 csum; + + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)eh, + EXT4_EXTENT_TAIL_OFFSET(eh)); + return cpu_to_le32(csum); +} + +static int ext4_extent_block_csum_verify(struct inode *inode, + struct ext4_extent_header *eh) +{ + struct ext4_extent_tail *et; + + if (!ext4_has_feature_metadata_csum(inode->i_sb)) + return 1; + + et = find_ext4_extent_tail(eh); + if (et->et_checksum != ext4_extent_block_csum(inode, eh)) + return 0; + return 1; +} + +static void ext4_extent_block_csum_set(struct inode *inode, + struct ext4_extent_header *eh) +{ + struct ext4_extent_tail *et; + + if (!ext4_has_feature_metadata_csum(inode->i_sb)) + return; + + et = find_ext4_extent_tail(eh); + et->et_checksum = ext4_extent_block_csum(inode, eh); +} + +static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t split, + int split_flag, int flags); + +static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) +{ + /* + * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this + * moment, get_block can be called only for blocks inside i_size since + * page cache has been already dropped and writes are blocked by + * i_rwsem. So we can safely drop the i_data_sem here. + */ + BUG_ON(EXT4_JOURNAL(inode) == NULL); + ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + *dropped = 1; + return 0; +} + +static inline void ext4_ext_path_brelse(struct ext4_ext_path *path) +{ + brelse(path->p_bh); + path->p_bh = NULL; +} + +static void ext4_ext_drop_refs(struct ext4_ext_path *path) +{ + int depth, i; + + if (IS_ERR_OR_NULL(path)) + return; + depth = path->p_depth; + for (i = 0; i <= depth; i++, path++) + ext4_ext_path_brelse(path); +} + +void ext4_free_ext_path(struct ext4_ext_path *path) +{ + if (IS_ERR_OR_NULL(path)) + return; + ext4_ext_drop_refs(path); + kfree(path); +} + +/* + * Make sure 'handle' has at least 'check_cred' credits. If not, restart + * transaction with 'restart_cred' credits. The function drops i_data_sem + * when restarting transaction and gets it after transaction is restarted. + * + * The function returns 0 on success, 1 if transaction had to be restarted, + * and < 0 in case of fatal error. 
+ */ +int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, + int check_cred, int restart_cred, + int revoke_cred) +{ + int ret; + int dropped = 0; + + ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred, + revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped)); + if (dropped) + down_write(&EXT4_I(inode)->i_data_sem); + return ret; +} + +/* + * could return: + * - EROFS + * - ENOMEM + */ +static int ext4_ext_get_access(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + int err = 0; + + if (path->p_bh) { + /* path points to block */ + BUFFER_TRACE(path->p_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode->i_sb, + path->p_bh, EXT4_JTR_NONE); + /* + * The extent buffer's verified bit will be set again in + * __ext4_ext_dirty(). We could leave an inconsistent + * buffer if the extents updating procudure break off du + * to some error happens, force to check it again. + */ + if (!err) + clear_buffer_verified(path->p_bh); + } + /* path points to leaf/index in inode body */ + /* we use in-core data, no need to protect them */ + return err; +} + +/* + * could return: + * - EROFS + * - ENOMEM + * - EIO + */ +static int __ext4_ext_dirty(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + int err; + + WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); + if (path->p_bh) { + ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh)); + /* path points to block */ + err = __ext4_handle_dirty_metadata(where, line, handle, + inode, path->p_bh); + /* Extents updating done, re-set verified flag */ + if (!err) + set_buffer_verified(path->p_bh); + } else { + /* path points to leaf/index in inode body */ + err = ext4_mark_inode_dirty(handle, inode); + } + return err; +} + +#define ext4_ext_dirty(handle, inode, path) \ + __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) + +static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t block) +{ + if (path) { + int depth = path->p_depth; + struct ext4_extent *ex; + + /* + * Try to predict block placement assuming that we are + * filling in a file which will eventually be + * non-sparse --- i.e., in the case of libbfd writing + * an ELF object sections out-of-order but in a way + * the eventually results in a contiguous object or + * executable file, or some database extending a table + * space file. However, this is actually somewhat + * non-ideal if we are writing a sparse file such as + * qemu or KVM writing a raw image file that is going + * to stay fairly sparse, since it will end up + * fragmenting the file system's free space. Maybe we + * should have some hueristics or some way to allow + * userspace to pass a hint to file system, + * especially if the latter case turns out to be + * common. + */ + ex = path[depth].p_ext; + if (ex) { + ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex); + ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block); + + if (block > ext_block) + return ext_pblk + (block - ext_block); + else + return ext_pblk - (ext_block - block); + } + + /* it looks like index is empty; + * try to find starting block from index itself */ + if (path[depth].p_bh) + return path[depth].p_bh->b_blocknr; + } + + /* OK. 
use inode's group */ + return ext4_inode_to_goal_block(inode); +} + +/* + * Allocation for a meta data block + */ +static ext4_fsblk_t +ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex, int *err, unsigned int flags) +{ + ext4_fsblk_t goal, newblock; + + goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); + newblock = ext4_new_meta_blocks(handle, inode, goal, flags, + NULL, err); + return newblock; +} + +static inline int ext4_ext_space_block(struct inode *inode, int check) +{ + int size; + + size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) + / sizeof(struct ext4_extent); +#ifdef AGGRESSIVE_TEST + if (!check && size > 6) + size = 6; +#endif + return size; +} + +static inline int ext4_ext_space_block_idx(struct inode *inode, int check) +{ + int size; + + size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) + / sizeof(struct ext4_extent_idx); +#ifdef AGGRESSIVE_TEST + if (!check && size > 5) + size = 5; +#endif + return size; +} + +static inline int ext4_ext_space_root(struct inode *inode, int check) +{ + int size; + + size = sizeof(EXT4_I(inode)->i_data); + size -= sizeof(struct ext4_extent_header); + size /= sizeof(struct ext4_extent); +#ifdef AGGRESSIVE_TEST + if (!check && size > 3) + size = 3; +#endif + return size; +} + +static inline int ext4_ext_space_root_idx(struct inode *inode, int check) +{ + int size; + + size = sizeof(EXT4_I(inode)->i_data); + size -= sizeof(struct ext4_extent_header); + size /= sizeof(struct ext4_extent_idx); +#ifdef AGGRESSIVE_TEST + if (!check && size > 4) + size = 4; +#endif + return size; +} + +static inline struct ext4_ext_path * +ext4_force_split_extent_at(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, ext4_lblk_t lblk, + int nofail) +{ + int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); + int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO; + + if (nofail) + flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL; + + return ext4_split_extent_at(handle, inode, path, lblk, unwritten ? 
+ EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, + flags); +} + +static int +ext4_ext_max_entries(struct inode *inode, int depth) +{ + int max; + + if (depth == ext_depth(inode)) { + if (depth == 0) + max = ext4_ext_space_root(inode, 1); + else + max = ext4_ext_space_root_idx(inode, 1); + } else { + if (depth == 0) + max = ext4_ext_space_block(inode, 1); + else + max = ext4_ext_space_block_idx(inode, 1); + } + + return max; +} + +static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) +{ + ext4_fsblk_t block = ext4_ext_pblock(ext); + int len = ext4_ext_get_actual_len(ext); + ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); + + /* + * We allow neither: + * - zero length + * - overflow/wrap-around + */ + if (lblock + len <= lblock) + return 0; + return ext4_inode_block_valid(inode, block, len); +} + +static int ext4_valid_extent_idx(struct inode *inode, + struct ext4_extent_idx *ext_idx) +{ + ext4_fsblk_t block = ext4_idx_pblock(ext_idx); + + return ext4_inode_block_valid(inode, block, 1); +} + +static int ext4_valid_extent_entries(struct inode *inode, + struct ext4_extent_header *eh, + ext4_lblk_t lblk, ext4_fsblk_t *pblk, + int depth) +{ + unsigned short entries; + ext4_lblk_t lblock = 0; + ext4_lblk_t cur = 0; + + if (eh->eh_entries == 0) + return 1; + + entries = le16_to_cpu(eh->eh_entries); + + if (depth == 0) { + /* leaf entries */ + struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); + + /* + * The logical block in the first entry should equal to + * the number in the index block. + */ + if (depth != ext_depth(inode) && + lblk != le32_to_cpu(ext->ee_block)) + return 0; + while (entries) { + if (!ext4_valid_extent(inode, ext)) + return 0; + + /* Check for overlapping extents */ + lblock = le32_to_cpu(ext->ee_block); + if (lblock < cur) { + *pblk = ext4_ext_pblock(ext); + return 0; + } + cur = lblock + ext4_ext_get_actual_len(ext); + ext++; + entries--; + } + } else { + struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); + + /* + * The logical block in the first entry should equal to + * the number in the parent index block. 
+ */ + if (depth != ext_depth(inode) && + lblk != le32_to_cpu(ext_idx->ei_block)) + return 0; + while (entries) { + if (!ext4_valid_extent_idx(inode, ext_idx)) + return 0; + + /* Check for overlapping index extents */ + lblock = le32_to_cpu(ext_idx->ei_block); + if (lblock < cur) { + *pblk = ext4_idx_pblock(ext_idx); + return 0; + } + ext_idx++; + entries--; + cur = lblock + 1; + } + } + return 1; +} + +static int __ext4_ext_check(const char *function, unsigned int line, + struct inode *inode, struct ext4_extent_header *eh, + int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk) +{ + const char *error_msg; + int max = 0, err = -EFSCORRUPTED; + + if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { + error_msg = "invalid magic"; + goto corrupted; + } + if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) { + error_msg = "unexpected eh_depth"; + goto corrupted; + } + if (unlikely(eh->eh_max == 0)) { + error_msg = "invalid eh_max"; + goto corrupted; + } + max = ext4_ext_max_entries(inode, depth); + if (unlikely(le16_to_cpu(eh->eh_max) > max)) { + error_msg = "too large eh_max"; + goto corrupted; + } + if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { + error_msg = "invalid eh_entries"; + goto corrupted; + } + if (unlikely((eh->eh_entries == 0) && (depth > 0))) { + error_msg = "eh_entries is 0 but eh_depth is > 0"; + goto corrupted; + } + if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) { + error_msg = "invalid extent entries"; + goto corrupted; + } + if (unlikely(depth > 32)) { + error_msg = "too large eh_depth"; + goto corrupted; + } + /* Verify checksum on non-root extent tree nodes */ + if (ext_depth(inode) != depth && + !ext4_extent_block_csum_verify(inode, eh)) { + error_msg = "extent tree corrupted"; + err = -EFSBADCRC; + goto corrupted; + } + return 0; + +corrupted: + ext4_error_inode_err(inode, function, line, 0, -err, + "pblk %llu bad header/extent: %s - magic %x, " + "entries %u, max %u(%u), depth %u(%u)", + (unsigned long long) pblk, error_msg, + le16_to_cpu(eh->eh_magic), + le16_to_cpu(eh->eh_entries), + le16_to_cpu(eh->eh_max), + max, le16_to_cpu(eh->eh_depth), depth); + return err; +} + +#define ext4_ext_check(inode, eh, depth, pblk) \ + __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0) + +int ext4_ext_check_inode(struct inode *inode) +{ + return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0); +} + +static void ext4_cache_extents(struct inode *inode, + struct ext4_extent_header *eh) +{ + struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); + ext4_lblk_t prev = 0; + int i; + + for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { + unsigned int status = EXTENT_STATUS_WRITTEN; + ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); + int len = ext4_ext_get_actual_len(ex); + + if (prev && (prev != lblk)) + ext4_es_cache_extent(inode, prev, lblk - prev, ~0, + EXTENT_STATUS_HOLE); + + if (ext4_ext_is_unwritten(ex)) + status = EXTENT_STATUS_UNWRITTEN; + ext4_es_cache_extent(inode, lblk, len, + ext4_ext_pblock(ex), status); + prev = lblk + len; + } +} + +static struct buffer_head * +__read_extent_tree_block(const char *function, unsigned int line, + struct inode *inode, struct ext4_extent_idx *idx, + int depth, int flags) +{ + struct buffer_head *bh; + int err; + gfp_t gfp_flags = __GFP_MOVABLE | GFP_NOFS; + ext4_fsblk_t pblk; + + if (flags & EXT4_EX_NOFAIL) + gfp_flags |= __GFP_NOFAIL; + + pblk = ext4_idx_pblock(idx); + bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags); + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); + + if 
(!bh_uptodate_or_lock(bh)) { + trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); + err = ext4_read_bh(bh, 0, NULL, false); + if (err < 0) + goto errout; + } + if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) + return bh; + err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh), + depth, pblk, le32_to_cpu(idx->ei_block)); + if (err) + goto errout; + set_buffer_verified(bh); + /* + * If this is a leaf block, cache all of its entries + */ + if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { + struct ext4_extent_header *eh = ext_block_hdr(bh); + ext4_cache_extents(inode, eh); + } + return bh; +errout: + put_bh(bh); + return ERR_PTR(err); + +} + +#define read_extent_tree_block(inode, idx, depth, flags) \ + __read_extent_tree_block(__func__, __LINE__, (inode), (idx), \ + (depth), (flags)) + +/* + * This function is called to cache a file's extent information in the + * extent status tree + */ +int ext4_ext_precache(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_ext_path *path = NULL; + struct buffer_head *bh; + int i = 0, depth, ret = 0; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return 0; /* not an extent-mapped inode */ + + ext4_check_map_extents_env(inode); + + down_read(&ei->i_data_sem); + depth = ext_depth(inode); + + /* Don't cache anything if there are no external extent blocks */ + if (!depth) { + up_read(&ei->i_data_sem); + return ret; + } + + path = kcalloc(depth + 1, sizeof(struct ext4_ext_path), + GFP_NOFS); + if (path == NULL) { + up_read(&ei->i_data_sem); + return -ENOMEM; + } + + path[0].p_hdr = ext_inode_hdr(inode); + ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); + if (ret) + goto out; + path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr); + while (i >= 0) { + /* + * If this is a leaf block or we've reached the end of + * the index block, go up + */ + if ((i == depth) || + path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { + ext4_ext_path_brelse(path + i); + i--; + continue; + } + bh = read_extent_tree_block(inode, path[i].p_idx++, + depth - i - 1, + EXT4_EX_FORCE_CACHE); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); + break; + } + i++; + path[i].p_bh = bh; + path[i].p_hdr = ext_block_hdr(bh); + path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr); + } + ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); +out: + up_read(&ei->i_data_sem); + ext4_free_ext_path(path); + return ret; +} + +#ifdef EXT_DEBUG +static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) +{ + int k, l = path->p_depth; + + ext_debug(inode, "path:"); + for (k = 0; k <= l; k++, path++) { + if (path->p_idx) { + ext_debug(inode, " %d->%llu", + le32_to_cpu(path->p_idx->ei_block), + ext4_idx_pblock(path->p_idx)); + } else if (path->p_ext) { + ext_debug(inode, " %d:[%d]%d:%llu ", + le32_to_cpu(path->p_ext->ee_block), + ext4_ext_is_unwritten(path->p_ext), + ext4_ext_get_actual_len(path->p_ext), + ext4_ext_pblock(path->p_ext)); + } else + ext_debug(inode, " []"); + } + ext_debug(inode, "\n"); +} + +static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) +{ + int depth = ext_depth(inode); + struct ext4_extent_header *eh; + struct ext4_extent *ex; + int i; + + if (IS_ERR_OR_NULL(path)) + return; + + eh = path[depth].p_hdr; + ex = EXT_FIRST_EXTENT(eh); + + ext_debug(inode, "Displaying leaf extents\n"); + + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { + ext_debug(inode, "%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), + ext4_ext_is_unwritten(ex), + ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); + } 
+ ext_debug(inode, "\n"); +} + +static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, + ext4_fsblk_t newblock, int level) +{ + int depth = ext_depth(inode); + struct ext4_extent *ex; + + if (depth != level) { + struct ext4_extent_idx *idx; + idx = path[level].p_idx; + while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { + ext_debug(inode, "%d: move %d:%llu in new index %llu\n", + level, le32_to_cpu(idx->ei_block), + ext4_idx_pblock(idx), newblock); + idx++; + } + + return; + } + + ex = path[depth].p_ext; + while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { + ext_debug(inode, "move %d:%llu:[%d]%d in new leaf %llu\n", + le32_to_cpu(ex->ee_block), + ext4_ext_pblock(ex), + ext4_ext_is_unwritten(ex), + ext4_ext_get_actual_len(ex), + newblock); + ex++; + } +} + +#else +#define ext4_ext_show_path(inode, path) +#define ext4_ext_show_leaf(inode, path) +#define ext4_ext_show_move(inode, path, newblock, level) +#endif + +/* + * ext4_ext_binsearch_idx: + * binary search for the closest index of the given block + * the header must be checked before calling this + */ +static void +ext4_ext_binsearch_idx(struct inode *inode, + struct ext4_ext_path *path, ext4_lblk_t block) +{ + struct ext4_extent_header *eh = path->p_hdr; + struct ext4_extent_idx *r, *l, *m; + + + ext_debug(inode, "binsearch for %u(idx): ", block); + + l = EXT_FIRST_INDEX(eh) + 1; + r = EXT_LAST_INDEX(eh); + while (l <= r) { + m = l + (r - l) / 2; + ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, + le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block), + r, le32_to_cpu(r->ei_block)); + + if (block < le32_to_cpu(m->ei_block)) + r = m - 1; + else + l = m + 1; + } + + path->p_idx = l - 1; + ext_debug(inode, " -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block), + ext4_idx_pblock(path->p_idx)); + +#ifdef CHECK_BINSEARCH + { + struct ext4_extent_idx *chix, *ix; + int k; + + chix = ix = EXT_FIRST_INDEX(eh); + for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { + if (k != 0 && le32_to_cpu(ix->ei_block) <= + le32_to_cpu(ix[-1].ei_block)) { + printk(KERN_DEBUG "k=%d, ix=0x%p, " + "first=0x%p\n", k, + ix, EXT_FIRST_INDEX(eh)); + printk(KERN_DEBUG "%u <= %u\n", + le32_to_cpu(ix->ei_block), + le32_to_cpu(ix[-1].ei_block)); + } + BUG_ON(k && le32_to_cpu(ix->ei_block) + <= le32_to_cpu(ix[-1].ei_block)); + if (block < le32_to_cpu(ix->ei_block)) + break; + chix = ix; + } + BUG_ON(chix != path->p_idx); + } +#endif + +} + +/* + * ext4_ext_binsearch: + * binary search for closest extent of the given block + * the header must be checked before calling this + */ +static void +ext4_ext_binsearch(struct inode *inode, + struct ext4_ext_path *path, ext4_lblk_t block) +{ + struct ext4_extent_header *eh = path->p_hdr; + struct ext4_extent *r, *l, *m; + + if (eh->eh_entries == 0) { + /* + * this leaf is empty: + * we get such a leaf in split/add case + */ + return; + } + + ext_debug(inode, "binsearch for %u: ", block); + + l = EXT_FIRST_EXTENT(eh) + 1; + r = EXT_LAST_EXTENT(eh); + + while (l <= r) { + m = l + (r - l) / 2; + ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, + le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block), + r, le32_to_cpu(r->ee_block)); + + if (block < le32_to_cpu(m->ee_block)) + r = m - 1; + else + l = m + 1; + } + + path->p_ext = l - 1; + ext_debug(inode, " -> %d:%llu:[%d]%d ", + le32_to_cpu(path->p_ext->ee_block), + ext4_ext_pblock(path->p_ext), + ext4_ext_is_unwritten(path->p_ext), + ext4_ext_get_actual_len(path->p_ext)); + +#ifdef CHECK_BINSEARCH + { + struct ext4_extent *chex, *ex; + int k; + + chex = ex = 
EXT_FIRST_EXTENT(eh); + for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) { + BUG_ON(k && le32_to_cpu(ex->ee_block) + <= le32_to_cpu(ex[-1].ee_block)); + if (block < le32_to_cpu(ex->ee_block)) + break; + chex = ex; + } + BUG_ON(chex != path->p_ext); + } +#endif + +} + +void ext4_ext_tree_init(handle_t *handle, struct inode *inode) +{ + struct ext4_extent_header *eh; + + eh = ext_inode_hdr(inode); + eh->eh_depth = 0; + eh->eh_entries = 0; + eh->eh_magic = EXT4_EXT_MAGIC; + eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); + eh->eh_generation = 0; + ext4_mark_inode_dirty(handle, inode); +} + +struct ext4_ext_path * +ext4_find_extent(struct inode *inode, ext4_lblk_t block, + struct ext4_ext_path *path, int flags) +{ + struct ext4_extent_header *eh; + struct buffer_head *bh; + short int depth, i, ppos = 0; + int ret; + gfp_t gfp_flags = GFP_NOFS; + + if (flags & EXT4_EX_NOFAIL) + gfp_flags |= __GFP_NOFAIL; + + eh = ext_inode_hdr(inode); + depth = ext_depth(inode); + if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) { + EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d", + depth); + ret = -EFSCORRUPTED; + goto err; + } + + if (path) { + ext4_ext_drop_refs(path); + if (depth > path[0].p_maxdepth) { + kfree(path); + path = NULL; + } + } + if (!path) { + /* account possible depth increase */ + path = kcalloc(depth + 2, sizeof(struct ext4_ext_path), + gfp_flags); + if (unlikely(!path)) + return ERR_PTR(-ENOMEM); + path[0].p_maxdepth = depth + 1; + } + path[0].p_hdr = eh; + path[0].p_bh = NULL; + + i = depth; + if (!(flags & EXT4_EX_NOCACHE) && depth == 0) + ext4_cache_extents(inode, eh); + /* walk through the tree */ + while (i) { + ext_debug(inode, "depth %d: num %d, max %d\n", + ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); + + ext4_ext_binsearch_idx(inode, path + ppos, block); + path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); + path[ppos].p_depth = i; + path[ppos].p_ext = NULL; + + bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); + goto err; + } + + eh = ext_block_hdr(bh); + ppos++; + path[ppos].p_bh = bh; + path[ppos].p_hdr = eh; + } + + path[ppos].p_depth = i; + path[ppos].p_ext = NULL; + path[ppos].p_idx = NULL; + + /* find extent */ + ext4_ext_binsearch(inode, path + ppos, block); + /* if not an empty leaf */ + if (path[ppos].p_ext) + path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); + + ext4_ext_show_path(inode, path); + + return path; + +err: + ext4_free_ext_path(path); + return ERR_PTR(ret); +} + +/* + * ext4_ext_insert_index: + * insert new index [@logical;@ptr] into the block at @curp; + * check where to insert: before @curp or after @curp + */ +static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, + struct ext4_ext_path *curp, + int logical, ext4_fsblk_t ptr) +{ + struct ext4_extent_idx *ix; + int len, err; + + err = ext4_ext_get_access(handle, inode, curp); + if (err) + return err; + + if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) { + EXT4_ERROR_INODE(inode, + "logical %d == ei_block %d!", + logical, le32_to_cpu(curp->p_idx->ei_block)); + return -EFSCORRUPTED; + } + + if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) + >= le16_to_cpu(curp->p_hdr->eh_max))) { + EXT4_ERROR_INODE(inode, + "eh_entries %d >= eh_max %d!", + le16_to_cpu(curp->p_hdr->eh_entries), + le16_to_cpu(curp->p_hdr->eh_max)); + return -EFSCORRUPTED; + } + + if (logical > le32_to_cpu(curp->p_idx->ei_block)) { + /* insert after */ + ext_debug(inode, "insert new index %d after: 
%llu\n", + logical, ptr); + ix = curp->p_idx + 1; + } else { + /* insert before */ + ext_debug(inode, "insert new index %d before: %llu\n", + logical, ptr); + ix = curp->p_idx; + } + + if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { + EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); + return -EFSCORRUPTED; + } + + len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; + BUG_ON(len < 0); + if (len > 0) { + ext_debug(inode, "insert new index %d: " + "move %d indices from 0x%p to 0x%p\n", + logical, len, ix, ix + 1); + memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); + } + + ix->ei_block = cpu_to_le32(logical); + ext4_idx_store_pblock(ix, ptr); + le16_add_cpu(&curp->p_hdr->eh_entries, 1); + + if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { + EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); + return -EFSCORRUPTED; + } + + err = ext4_ext_dirty(handle, inode, curp); + ext4_std_error(inode->i_sb, err); + + return err; +} + +/* + * ext4_ext_split: + * inserts new subtree into the path, using free index entry + * at depth @at: + * - allocates all needed blocks (new leaf and all intermediate index blocks) + * - makes decision where to split + * - moves remaining extents and index entries (right to the split point) + * into the newly allocated blocks + * - initializes subtree + */ +static int ext4_ext_split(handle_t *handle, struct inode *inode, + unsigned int flags, + struct ext4_ext_path *path, + struct ext4_extent *newext, int at) +{ + struct buffer_head *bh = NULL; + int depth = ext_depth(inode); + struct ext4_extent_header *neh; + struct ext4_extent_idx *fidx; + int i = at, k, m, a; + ext4_fsblk_t newblock, oldblock; + __le32 border; + ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ + gfp_t gfp_flags = GFP_NOFS; + int err = 0; + size_t ext_size = 0; + + if (flags & EXT4_EX_NOFAIL) + gfp_flags |= __GFP_NOFAIL; + + /* make decision: where to split? */ + /* FIXME: now decision is simplest: at current extent */ + + /* if current leaf will be split, then we should use + * border from split point */ + if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); + return -EFSCORRUPTED; + } + if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { + border = path[depth].p_ext[1].ee_block; + ext_debug(inode, "leaf will be split." + " next leaf starts at %d\n", + le32_to_cpu(border)); + } else { + border = newext->ee_block; + ext_debug(inode, "leaf will be added." + " next leaf starts at %d\n", + le32_to_cpu(border)); + } + + /* + * If error occurs, then we break processing + * and mark filesystem read-only. index won't + * be inserted and tree will be in consistent + * state. Next mount will repair buffers too. + */ + + /* + * Get array to track all allocated blocks. + * We need this to handle errors and free blocks + * upon them. 
+ */ + ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), gfp_flags); + if (!ablocks) + return -ENOMEM; + + /* allocate all needed blocks */ + ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at); + for (a = 0; a < depth - at; a++) { + newblock = ext4_ext_new_meta_block(handle, inode, path, + newext, &err, flags); + if (newblock == 0) + goto cleanup; + ablocks[a] = newblock; + } + + /* initialize new leaf */ + newblock = ablocks[--a]; + if (unlikely(newblock == 0)) { + EXT4_ERROR_INODE(inode, "newblock == 0!"); + err = -EFSCORRUPTED; + goto cleanup; + } + bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); + if (unlikely(!bh)) { + err = -ENOMEM; + goto cleanup; + } + lock_buffer(bh); + + err = ext4_journal_get_create_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); + if (err) + goto cleanup; + + neh = ext_block_hdr(bh); + neh->eh_entries = 0; + neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); + neh->eh_magic = EXT4_EXT_MAGIC; + neh->eh_depth = 0; + neh->eh_generation = 0; + + /* move remainder of path[depth] to the new leaf */ + if (unlikely(path[depth].p_hdr->eh_entries != + path[depth].p_hdr->eh_max)) { + EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", + path[depth].p_hdr->eh_entries, + path[depth].p_hdr->eh_max); + err = -EFSCORRUPTED; + goto cleanup; + } + /* start copy from next extent */ + m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; + ext4_ext_show_move(inode, path, newblock, depth); + if (m) { + struct ext4_extent *ex; + ex = EXT_FIRST_EXTENT(neh); + memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m); + le16_add_cpu(&neh->eh_entries, m); + } + + /* zero out unused area in the extent block */ + ext_size = sizeof(struct ext4_extent_header) + + sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries); + memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); + ext4_extent_block_csum_set(inode, neh); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto cleanup; + brelse(bh); + bh = NULL; + + /* correct old leaf */ + if (m) { + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto cleanup; + le16_add_cpu(&path[depth].p_hdr->eh_entries, -m); + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto cleanup; + + } + + /* create intermediate indexes */ + k = depth - at - 1; + if (unlikely(k < 0)) { + EXT4_ERROR_INODE(inode, "k %d < 0!", k); + err = -EFSCORRUPTED; + goto cleanup; + } + if (k) + ext_debug(inode, "create %d intermediate indices\n", k); + /* insert new index into current index block */ + /* current depth stored in i var */ + i = depth - 1; + while (k--) { + oldblock = newblock; + newblock = ablocks[--a]; + bh = sb_getblk(inode->i_sb, newblock); + if (unlikely(!bh)) { + err = -ENOMEM; + goto cleanup; + } + lock_buffer(bh); + + err = ext4_journal_get_create_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); + if (err) + goto cleanup; + + neh = ext_block_hdr(bh); + neh->eh_entries = cpu_to_le16(1); + neh->eh_magic = EXT4_EXT_MAGIC; + neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); + neh->eh_depth = cpu_to_le16(depth - i); + neh->eh_generation = 0; + fidx = EXT_FIRST_INDEX(neh); + fidx->ei_block = border; + ext4_idx_store_pblock(fidx, oldblock); + + ext_debug(inode, "int.index at %d (block %llu): %u -> %llu\n", + i, newblock, le32_to_cpu(border), oldblock); + + /* move remainder of path[i] to the new index block */ + if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != + 
EXT_LAST_INDEX(path[i].p_hdr))) { + EXT4_ERROR_INODE(inode, + "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", + le32_to_cpu(path[i].p_ext->ee_block)); + err = -EFSCORRUPTED; + goto cleanup; + } + /* start copy indexes */ + m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; + ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx, + EXT_MAX_INDEX(path[i].p_hdr)); + ext4_ext_show_move(inode, path, newblock, i); + if (m) { + memmove(++fidx, path[i].p_idx, + sizeof(struct ext4_extent_idx) * m); + le16_add_cpu(&neh->eh_entries, m); + } + /* zero out unused area in the extent block */ + ext_size = sizeof(struct ext4_extent_header) + + (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries)); + memset(bh->b_data + ext_size, 0, + inode->i_sb->s_blocksize - ext_size); + ext4_extent_block_csum_set(inode, neh); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto cleanup; + brelse(bh); + bh = NULL; + + /* correct old index */ + if (m) { + err = ext4_ext_get_access(handle, inode, path + i); + if (err) + goto cleanup; + le16_add_cpu(&path[i].p_hdr->eh_entries, -m); + err = ext4_ext_dirty(handle, inode, path + i); + if (err) + goto cleanup; + } + + i--; + } + + /* insert new index */ + err = ext4_ext_insert_index(handle, inode, path + at, + le32_to_cpu(border), newblock); + +cleanup: + if (bh) { + if (buffer_locked(bh)) + unlock_buffer(bh); + brelse(bh); + } + + if (err) { + /* free all allocated blocks in error case */ + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; + ext4_free_blocks(handle, inode, NULL, ablocks[i], 1, + EXT4_FREE_BLOCKS_METADATA); + } + } + kfree(ablocks); + + return err; +} + +/* + * ext4_ext_grow_indepth: + * implements tree growing procedure: + * - allocates new block + * - moves top-level data (index block or leaf) into the new block + * - initializes new top-level, creating index that points to the + * just created block + */ +static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, + unsigned int flags) +{ + struct ext4_extent_header *neh; + struct buffer_head *bh; + ext4_fsblk_t newblock, goal = 0; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + int err = 0; + size_t ext_size = 0; + + /* Try to prepend new index to old one */ + if (ext_depth(inode)) + goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode))); + if (goal > le32_to_cpu(es->s_first_data_block)) { + flags |= EXT4_MB_HINT_TRY_GOAL; + goal--; + } else + goal = ext4_inode_to_goal_block(inode); + newblock = ext4_new_meta_blocks(handle, inode, goal, flags, + NULL, &err); + if (newblock == 0) + return err; + + bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); + if (unlikely(!bh)) + return -ENOMEM; + lock_buffer(bh); + + err = ext4_journal_get_create_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); + if (err) { + unlock_buffer(bh); + goto out; + } + + ext_size = sizeof(EXT4_I(inode)->i_data); + /* move top-level index/leaf into new block */ + memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size); + /* zero out unused area in the extent block */ + memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); + + /* set size of new block */ + neh = ext_block_hdr(bh); + /* old root could have indexes or leaves + * so calculate e_max right way */ + if (ext_depth(inode)) + neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); + else + neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); + neh->eh_magic = EXT4_EXT_MAGIC; + ext4_extent_block_csum_set(inode, neh); + 
set_buffer_uptodate(bh); + set_buffer_verified(bh); + unlock_buffer(bh); + + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto out; + + /* Update top-level index: num,max,pointer */ + neh = ext_inode_hdr(inode); + neh->eh_entries = cpu_to_le16(1); + ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock); + if (neh->eh_depth == 0) { + /* Root extent block becomes index block */ + neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); + EXT_FIRST_INDEX(neh)->ei_block = + EXT_FIRST_EXTENT(neh)->ee_block; + } + ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %llu\n", + le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), + le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), + ext4_idx_pblock(EXT_FIRST_INDEX(neh))); + + le16_add_cpu(&neh->eh_depth, 1); + err = ext4_mark_inode_dirty(handle, inode); +out: + brelse(bh); + + return err; +} + +/* + * ext4_ext_create_new_leaf: + * finds empty index and adds new leaf. + * if no free index is found, then it requests in-depth growing. + */ +static struct ext4_ext_path * +ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, + unsigned int mb_flags, unsigned int gb_flags, + struct ext4_ext_path *path, + struct ext4_extent *newext) +{ + struct ext4_ext_path *curp; + int depth, i, err = 0; + ext4_lblk_t ee_block = le32_to_cpu(newext->ee_block); + +repeat: + i = depth = ext_depth(inode); + + /* walk up to the tree and look for free index entry */ + curp = path + depth; + while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { + i--; + curp--; + } + + /* we use already allocated block for index block, + * so subsequent data blocks should be contiguous */ + if (EXT_HAS_FREE_INDEX(curp)) { + /* if we found index with free entry, then use that + * entry: create all needed subtree and add new leaf */ + err = ext4_ext_split(handle, inode, mb_flags, path, newext, i); + if (err) + goto errout; + + /* refill path */ + path = ext4_find_extent(inode, ee_block, path, gb_flags); + return path; + } + + /* tree is full, time to grow in depth */ + err = ext4_ext_grow_indepth(handle, inode, mb_flags); + if (err) + goto errout; + + /* refill path */ + path = ext4_find_extent(inode, ee_block, path, gb_flags); + if (IS_ERR(path)) + return path; + + /* + * only first (depth 0 -> 1) produces free space; + * in all other cases we have to split the grown tree + */ + depth = ext_depth(inode); + if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { + /* now we need to split */ + goto repeat; + } + + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); +} + +/* + * search the closest allocated block to the left for *logical + * and returns it at @logical + it's physical address at @phys + * if *logical is the smallest allocated block, the function + * returns 0 at @phys + * return value contains 0 (success) or error code + */ +static int ext4_ext_search_left(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *logical, ext4_fsblk_t *phys) +{ + struct ext4_extent_idx *ix; + struct ext4_extent *ex; + int depth, ee_len; + + if (unlikely(path == NULL)) { + EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); + return -EFSCORRUPTED; + } + depth = path->p_depth; + *phys = 0; + + if (depth == 0 && path->p_ext == NULL) + return 0; + + /* usually extent in the path covers blocks smaller + * then *logical, but it can be that extent is the + * first one in the file */ + + ex = path[depth].p_ext; + ee_len = ext4_ext_get_actual_len(ex); + if (*logical < le32_to_cpu(ex->ee_block)) { + if 
(unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { + EXT4_ERROR_INODE(inode, + "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!", + *logical, le32_to_cpu(ex->ee_block)); + return -EFSCORRUPTED; + } + while (--depth >= 0) { + ix = path[depth].p_idx; + if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, + "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", + ix != NULL ? le32_to_cpu(ix->ei_block) : 0, + le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block), + depth); + return -EFSCORRUPTED; + } + } + return 0; + } + + if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { + EXT4_ERROR_INODE(inode, + "logical %d < ee_block %d + ee_len %d!", + *logical, le32_to_cpu(ex->ee_block), ee_len); + return -EFSCORRUPTED; + } + + *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; + *phys = ext4_ext_pblock(ex) + ee_len - 1; + return 0; +} + +/* + * Search the closest allocated block to the right for *logical + * and returns it at @logical + it's physical address at @phys. + * If not exists, return 0 and @phys is set to 0. We will return + * 1 which means we found an allocated block and ret_ex is valid. + * Or return a (< 0) error code. + */ +static int ext4_ext_search_right(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *logical, ext4_fsblk_t *phys, + struct ext4_extent *ret_ex, int flags) +{ + struct buffer_head *bh = NULL; + struct ext4_extent_header *eh; + struct ext4_extent_idx *ix; + struct ext4_extent *ex; + int depth; /* Note, NOT eh_depth; depth from top of tree */ + int ee_len; + + if (unlikely(path == NULL)) { + EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); + return -EFSCORRUPTED; + } + depth = path->p_depth; + *phys = 0; + + if (depth == 0 && path->p_ext == NULL) + return 0; + + /* usually extent in the path covers blocks smaller + * then *logical, but it can be that extent is the + * first one in the file */ + + ex = path[depth].p_ext; + ee_len = ext4_ext_get_actual_len(ex); + if (*logical < le32_to_cpu(ex->ee_block)) { + if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { + EXT4_ERROR_INODE(inode, + "first_extent(path[%d].p_hdr) != ex", + depth); + return -EFSCORRUPTED; + } + while (--depth >= 0) { + ix = path[depth].p_idx; + if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, + "ix != EXT_FIRST_INDEX *logical %d!", + *logical); + return -EFSCORRUPTED; + } + } + goto found_extent; + } + + if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { + EXT4_ERROR_INODE(inode, + "logical %d < ee_block %d + ee_len %d!", + *logical, le32_to_cpu(ex->ee_block), ee_len); + return -EFSCORRUPTED; + } + + if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { + /* next allocated block in this leaf */ + ex++; + goto found_extent; + } + + /* go up and search for index to the right */ + while (--depth >= 0) { + ix = path[depth].p_idx; + if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) + goto got_index; + } + + /* we've gone up to the root and found no index to the right */ + return 0; + +got_index: + /* we've found index to the right, let's + * follow it and find the closest allocated + * block to the right */ + ix++; + while (++depth < path->p_depth) { + /* subtract from p_depth to get proper eh_depth */ + bh = read_extent_tree_block(inode, ix, path->p_depth - depth, + flags); + if (IS_ERR(bh)) + return PTR_ERR(bh); + eh = ext_block_hdr(bh); + ix = EXT_FIRST_INDEX(eh); + put_bh(bh); + } + + bh = read_extent_tree_block(inode, ix, path->p_depth - depth, flags); + if (IS_ERR(bh)) + return 
PTR_ERR(bh); + eh = ext_block_hdr(bh); + ex = EXT_FIRST_EXTENT(eh); +found_extent: + *logical = le32_to_cpu(ex->ee_block); + *phys = ext4_ext_pblock(ex); + if (ret_ex) + *ret_ex = *ex; + if (bh) + put_bh(bh); + return 1; +} + +/* + * ext4_ext_next_allocated_block: + * returns allocated block in subsequent extent or EXT_MAX_BLOCKS. + * NOTE: it considers block number from index entry as + * allocated block. Thus, index entries have to be consistent + * with leaves. + */ +ext4_lblk_t +ext4_ext_next_allocated_block(struct ext4_ext_path *path) +{ + int depth; + + BUG_ON(path == NULL); + depth = path->p_depth; + + if (depth == 0 && path->p_ext == NULL) + return EXT_MAX_BLOCKS; + + while (depth >= 0) { + struct ext4_ext_path *p = &path[depth]; + + if (depth == path->p_depth) { + /* leaf */ + if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr)) + return le32_to_cpu(p->p_ext[1].ee_block); + } else { + /* index */ + if (p->p_idx != EXT_LAST_INDEX(p->p_hdr)) + return le32_to_cpu(p->p_idx[1].ei_block); + } + depth--; + } + + return EXT_MAX_BLOCKS; +} + +/* + * ext4_ext_next_leaf_block: + * returns first allocated block from next leaf or EXT_MAX_BLOCKS + */ +static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path) +{ + int depth; + + BUG_ON(path == NULL); + depth = path->p_depth; + + /* zero-tree has no leaf blocks at all */ + if (depth == 0) + return EXT_MAX_BLOCKS; + + /* go to index block */ + depth--; + + while (depth >= 0) { + if (path[depth].p_idx != + EXT_LAST_INDEX(path[depth].p_hdr)) + return (ext4_lblk_t) + le32_to_cpu(path[depth].p_idx[1].ei_block); + depth--; + } + + return EXT_MAX_BLOCKS; +} + +/* + * ext4_ext_correct_indexes: + * if leaf gets modified and modified extent is first in the leaf, + * then we have to correct all indexes above. + * TODO: do we need to correct tree in all cases? + */ +static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + struct ext4_extent_header *eh; + int depth = ext_depth(inode); + struct ext4_extent *ex; + __le32 border; + int k, err = 0; + + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + + if (unlikely(ex == NULL || eh == NULL)) { + EXT4_ERROR_INODE(inode, + "ex %p == NULL or eh %p == NULL", ex, eh); + return -EFSCORRUPTED; + } + + if (depth == 0) { + /* there is no tree at all */ + return 0; + } + + if (ex != EXT_FIRST_EXTENT(eh)) { + /* we correct tree if first leaf got modified only */ + return 0; + } + + /* + * TODO: we need correction if border is smaller than current one + */ + k = depth - 1; + border = path[depth].p_ext->ee_block; + err = ext4_ext_get_access(handle, inode, path + k); + if (err) + return err; + path[k].p_idx->ei_block = border; + err = ext4_ext_dirty(handle, inode, path + k); + if (err) + return err; + + while (k--) { + /* change all left-side indexes */ + if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) + break; + err = ext4_ext_get_access(handle, inode, path + k); + if (err) + goto clean; + path[k].p_idx->ei_block = border; + err = ext4_ext_dirty(handle, inode, path + k); + if (err) + goto clean; + } + return 0; + +clean: + /* + * The path[k].p_bh is either unmodified or with no verified bit + * set (see ext4_ext_get_access()). So just clear the verified bit + * of the successfully modified extents buffers, which will force + * these extents to be checked to avoid using inconsistent data. 
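+ *
+ * (Illustrative aside, not upstream text: if the leaf's first key moved
+ * from, say, 100 to 96, the loop above rewrites every ancestor whose
+ * index entry is the first in its node. Failing midway could leave the
+ * parent saying 96 while the grandparent still says 100, which is why
+ * the partially-updated buffers are forced through verification again
+ * below.)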
+ */ + while (++k < depth) + clear_buffer_verified(path[k].p_bh); + + return err; +} + +static int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2) +{ + unsigned short ext1_ee_len, ext2_ee_len; + + if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2)) + return 0; + + ext1_ee_len = ext4_ext_get_actual_len(ex1); + ext2_ee_len = ext4_ext_get_actual_len(ex2); + + if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != + le32_to_cpu(ex2->ee_block)) + return 0; + + if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) + return 0; + + if (ext4_ext_is_unwritten(ex1) && + ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN) + return 0; +#ifdef AGGRESSIVE_TEST + if (ext1_ee_len >= 4) + return 0; +#endif + + if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2)) + return 1; + return 0; +} + +/* + * This function tries to merge the "ex" extent to the next extent in the tree. + * It always tries to merge towards right. If you want to merge towards + * left, pass "ex - 1" as argument instead of "ex". + * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns + * 1 if they got merged. + */ +static int ext4_ext_try_to_merge_right(struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex) +{ + struct ext4_extent_header *eh; + unsigned int depth, len; + int merge_done = 0, unwritten; + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + eh = path[depth].p_hdr; + + while (ex < EXT_LAST_EXTENT(eh)) { + if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) + break; + /* merge with next extent! */ + unwritten = ext4_ext_is_unwritten(ex); + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(ex + 1)); + if (unwritten) + ext4_ext_mark_unwritten(ex); + + if (ex + 1 < EXT_LAST_EXTENT(eh)) { + len = (EXT_LAST_EXTENT(eh) - ex - 1) + * sizeof(struct ext4_extent); + memmove(ex + 1, ex + 2, len); + } + le16_add_cpu(&eh->eh_entries, -1); + merge_done = 1; + WARN_ON(eh->eh_entries == 0); + if (!eh->eh_entries) + EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!"); + } + + return merge_done; +} + +/* + * This function does a very simple check to see if we can collapse + * an extent tree with a single extent tree leaf block into the inode. + */ +static void ext4_ext_try_to_merge_up(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path) +{ + size_t s; + unsigned max_root = ext4_ext_space_root(inode, 0); + ext4_fsblk_t blk; + + if ((path[0].p_depth != 1) || + (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) || + (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root)) + return; + + /* + * We need to modify the block allocation bitmap and the block + * group descriptor to release the extent tree block. If we + * can't get the journal credits, give up. 
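+ *
+ * (Illustrative aside, not upstream text: the collapse turns
+ *
+ *	root (depth 1, one index) -> leaf block (at most 4 extents)
+ *
+ * into a depth-0 root holding those extents directly in i_data, after
+ * which the leaf block is freed; the two credits requested below cover
+ * the bitmap and group-descriptor updates for that free.)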
+ */ + if (ext4_journal_extend(handle, 2, + ext4_free_metadata_revoke_credits(inode->i_sb, 1))) + return; + + /* + * Copy the extent data up to the inode + */ + blk = ext4_idx_pblock(path[0].p_idx); + s = le16_to_cpu(path[1].p_hdr->eh_entries) * + sizeof(struct ext4_extent_idx); + s += sizeof(struct ext4_extent_header); + + path[1].p_maxdepth = path[0].p_maxdepth; + memcpy(path[0].p_hdr, path[1].p_hdr, s); + path[0].p_depth = 0; + path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + + (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); + path[0].p_hdr->eh_max = cpu_to_le16(max_root); + + ext4_ext_path_brelse(path + 1); + ext4_free_blocks(handle, inode, NULL, blk, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); +} + +/* + * This function tries to merge the @ex extent to neighbours in the tree, then + * tries to collapse the extent tree into the inode. + */ +static void ext4_ext_try_to_merge(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex) +{ + struct ext4_extent_header *eh; + unsigned int depth; + int merge_done = 0; + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + eh = path[depth].p_hdr; + + if (ex > EXT_FIRST_EXTENT(eh)) + merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); + + if (!merge_done) + (void) ext4_ext_try_to_merge_right(inode, path, ex); + + ext4_ext_try_to_merge_up(handle, inode, path); +} + +/* + * check if a portion of the "newext" extent overlaps with an + * existing extent. + * + * If there is an overlap discovered, it updates the length of the newext + * such that there will be no overlap, and then returns 1. + * If there is no overlap found, it returns 0. + */ +static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, + struct inode *inode, + struct ext4_extent *newext, + struct ext4_ext_path *path) +{ + ext4_lblk_t b1, b2; + unsigned int depth, len1; + unsigned int ret = 0; + + b1 = le32_to_cpu(newext->ee_block); + len1 = ext4_ext_get_actual_len(newext); + depth = ext_depth(inode); + if (!path[depth].p_ext) + goto out; + b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block)); + + /* + * get the next allocated block if the extent in the path + * is before the requested block(s) + */ + if (b2 < b1) { + b2 = ext4_ext_next_allocated_block(path); + if (b2 == EXT_MAX_BLOCKS) + goto out; + b2 = EXT4_LBLK_CMASK(sbi, b2); + } + + /* check for wrap through zero on extent logical start block*/ + if (b1 + len1 < b1) { + len1 = EXT_MAX_BLOCKS - b1; + newext->ee_len = cpu_to_le16(len1); + ret = 1; + } + + /* check for overlap */ + if (b1 + len1 > b2) { + newext->ee_len = cpu_to_le16(b2 - b1); + ret = 1; + } +out: + return ret; +} + +/* + * ext4_ext_insert_extent: + * tries to merge requested extent into the existing extent or + * inserts requested extent as new one into the tree, + * creating new leaf in the no-space case. 
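+ *
+ * (Illustrative aside, not upstream text: the ownership contract is
+ * that the caller hands over a path from ext4_find_extent() and keeps
+ * only the returned pointer, e.g.
+ *
+ *	path = ext4_find_extent(inode, lblk, NULL, 0);
+ *	...
+ *	path = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
+ *	if (IS_ERR(path))
+ *		return PTR_ERR(path);
+ *
+ * since on failure the old path has already been freed and an ERR_PTR
+ * comes back.)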
+ */ +struct ext4_ext_path * +ext4_ext_insert_extent(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext, int gb_flags) +{ + struct ext4_extent_header *eh; + struct ext4_extent *ex, *fex; + struct ext4_extent *nearex; /* nearest extent */ + int depth, len, err = 0; + ext4_lblk_t next; + int mb_flags = 0, unwritten; + + if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + mb_flags |= EXT4_MB_DELALLOC_RESERVED; + if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { + EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); + err = -EFSCORRUPTED; + goto errout; + } + depth = ext_depth(inode); + ex = path[depth].p_ext; + eh = path[depth].p_hdr; + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + err = -EFSCORRUPTED; + goto errout; + } + + /* try to insert block into found extent and return */ + if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) { + + /* + * Try to see whether we should rather test the extent on + * right from ex, or from the left of ex. This is because + * ext4_find_extent() can return either extent on the + * left, or on the right from the searched position. This + * will make merging more effective. + */ + if (ex < EXT_LAST_EXTENT(eh) && + (le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex) < + le32_to_cpu(newext->ee_block))) { + ex += 1; + goto prepend; + } else if ((ex > EXT_FIRST_EXTENT(eh)) && + (le32_to_cpu(newext->ee_block) + + ext4_ext_get_actual_len(newext) < + le32_to_cpu(ex->ee_block))) + ex -= 1; + + /* Try to append newex to the ex */ + if (ext4_can_extents_be_merged(inode, ex, newext)) { + ext_debug(inode, "append [%d]%d block to %u:[%d]%d" + "(from %llu)\n", + ext4_ext_is_unwritten(newext), + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_is_unwritten(ex), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex)); + err = ext4_ext_get_access(handle, inode, + path + depth); + if (err) + goto errout; + unwritten = ext4_ext_is_unwritten(ex); + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(newext)); + if (unwritten) + ext4_ext_mark_unwritten(ex); + nearex = ex; + goto merge; + } + +prepend: + /* Try to prepend newex to the ex */ + if (ext4_can_extents_be_merged(inode, newext, ex)) { + ext_debug(inode, "prepend %u[%d]%d block to %u:[%d]%d" + "(from %llu)\n", + le32_to_cpu(newext->ee_block), + ext4_ext_is_unwritten(newext), + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_is_unwritten(ex), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex)); + err = ext4_ext_get_access(handle, inode, + path + depth); + if (err) + goto errout; + + unwritten = ext4_ext_is_unwritten(ex); + ex->ee_block = newext->ee_block; + ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(newext)); + if (unwritten) + ext4_ext_mark_unwritten(ex); + nearex = ex; + goto merge; + } + } + + depth = ext_depth(inode); + eh = path[depth].p_hdr; + if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) + goto has_space; + + /* probably next leaf has space for us? 
*/ + fex = EXT_LAST_EXTENT(eh); + next = EXT_MAX_BLOCKS; + if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) + next = ext4_ext_next_leaf_block(path); + if (next != EXT_MAX_BLOCKS) { + struct ext4_ext_path *npath; + + ext_debug(inode, "next leaf block - %u\n", next); + npath = ext4_find_extent(inode, next, NULL, gb_flags); + if (IS_ERR(npath)) { + err = PTR_ERR(npath); + goto errout; + } + BUG_ON(npath->p_depth != path->p_depth); + eh = npath[depth].p_hdr; + if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { + ext_debug(inode, "next leaf isn't full(%d)\n", + le16_to_cpu(eh->eh_entries)); + ext4_free_ext_path(path); + path = npath; + goto has_space; + } + ext_debug(inode, "next leaf has no free space(%d,%d)\n", + le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); + ext4_free_ext_path(npath); + } + + /* + * There is no free space in the found leaf. + * We're gonna add a new leaf in the tree. + */ + if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) + mb_flags |= EXT4_MB_USE_RESERVED; + path = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, + path, newext); + if (IS_ERR(path)) + return path; + depth = ext_depth(inode); + eh = path[depth].p_hdr; + +has_space: + nearex = path[depth].p_ext; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto errout; + + if (!nearex) { + /* there is no extent in this leaf, create first one */ + ext_debug(inode, "first extent in the leaf: %u:%llu:[%d]%d\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_unwritten(newext), + ext4_ext_get_actual_len(newext)); + nearex = EXT_FIRST_EXTENT(eh); + } else { + if (le32_to_cpu(newext->ee_block) + > le32_to_cpu(nearex->ee_block)) { + /* Insert after */ + ext_debug(inode, "insert %u:%llu:[%d]%d before: " + "nearest %p\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_unwritten(newext), + ext4_ext_get_actual_len(newext), + nearex); + nearex++; + } else { + /* Insert before */ + BUG_ON(newext->ee_block == nearex->ee_block); + ext_debug(inode, "insert %u:%llu:[%d]%d after: " + "nearest %p\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_unwritten(newext), + ext4_ext_get_actual_len(newext), + nearex); + } + len = EXT_LAST_EXTENT(eh) - nearex + 1; + if (len > 0) { + ext_debug(inode, "insert %u:%llu:[%d]%d: " + "move %d extents from 0x%p to 0x%p\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_unwritten(newext), + ext4_ext_get_actual_len(newext), + len, nearex, nearex + 1); + memmove(nearex + 1, nearex, + len * sizeof(struct ext4_extent)); + } + } + + le16_add_cpu(&eh->eh_entries, 1); + path[depth].p_ext = nearex; + nearex->ee_block = newext->ee_block; + ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); + nearex->ee_len = newext->ee_len; + +merge: + /* try to merge extents */ + if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) + ext4_ext_try_to_merge(handle, inode, path, nearex); + + /* time to correct all indexes above */ + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto errout; + + err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + goto errout; + + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); +} + +static int ext4_fill_es_cache_info(struct inode *inode, + ext4_lblk_t block, ext4_lblk_t num, + struct fiemap_extent_info *fieinfo) +{ + ext4_lblk_t next, end = block + num - 1; + struct extent_status es; + unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; + unsigned int flags; + 
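/*
+ * Editorial aside, not upstream text: fiemap reports byte ranges, so
+ * the loop below shifts the extent-status fields by s_blocksize_bits;
+ * with 4 KiB blocks, es_lblk = 10 becomes a logical byte offset of
+ * 10 << 12 = 40960.
+ */ +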
int err; + + while (block <= end) { + next = 0; + flags = 0; + if (!ext4_es_lookup_extent(inode, block, &next, &es)) + break; + if (ext4_es_is_unwritten(&es)) + flags |= FIEMAP_EXTENT_UNWRITTEN; + if (ext4_es_is_delayed(&es)) + flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + if (ext4_es_is_hole(&es)) + flags |= EXT4_FIEMAP_EXTENT_HOLE; + if (next == 0) + flags |= FIEMAP_EXTENT_LAST; + if (flags & (FIEMAP_EXTENT_DELALLOC| + EXT4_FIEMAP_EXTENT_HOLE)) + es.es_pblk = 0; + else + es.es_pblk = ext4_es_pblock(&es); + err = fiemap_fill_next_extent(fieinfo, + (__u64)es.es_lblk << blksize_bits, + (__u64)es.es_pblk << blksize_bits, + (__u64)es.es_len << blksize_bits, + flags); + if (next == 0) + break; + block = next; + if (err < 0) + return err; + if (err == 1) + return 0; + } + return 0; +} + + +/* + * ext4_ext_find_hole - find hole around given block according to the given path + * @inode: inode we lookup in + * @path: path in extent tree to @lblk + * @lblk: pointer to logical block around which we want to determine hole + * + * Determine hole length (and start if easily possible) around given logical + * block. We don't try too hard to find the beginning of the hole but @path + * actually points to extent before @lblk, we provide it. + * + * The function returns the length of a hole starting at @lblk. We update @lblk + * to the beginning of the hole if we managed to find it. + */ +static ext4_lblk_t ext4_ext_find_hole(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *lblk) +{ + int depth = ext_depth(inode); + struct ext4_extent *ex; + ext4_lblk_t len; + + ex = path[depth].p_ext; + if (ex == NULL) { + /* there is no extent yet, so gap is [0;-] */ + *lblk = 0; + len = EXT_MAX_BLOCKS; + } else if (*lblk < le32_to_cpu(ex->ee_block)) { + len = le32_to_cpu(ex->ee_block) - *lblk; + } else if (*lblk >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { + ext4_lblk_t next; + + *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); + next = ext4_ext_next_allocated_block(path); + BUG_ON(next == *lblk); + len = next - *lblk; + } else { + BUG(); + } + return len; +} + +/* + * ext4_ext_rm_idx: + * removes index from the index block. 
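+ *
+ * (Illustrative aside, not upstream text: with index entries {A, B, C}
+ * and B being removed, the tail {C} is shifted left by the memmove()
+ * below and eh_entries drops from 3 to 2; the block the removed index
+ * pointed at is then freed, and first-entry keys are propagated
+ * upwards, mirroring ext4_ext_correct_indexes().)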
+ */ +static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, int depth) +{ + int err; + ext4_fsblk_t leaf; + int k = depth - 1; + + /* free index block */ + leaf = ext4_idx_pblock(path[k].p_idx); + if (unlikely(path[k].p_hdr->eh_entries == 0)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr->eh_entries == 0", k); + return -EFSCORRUPTED; + } + err = ext4_ext_get_access(handle, inode, path + k); + if (err) + return err; + + if (path[k].p_idx != EXT_LAST_INDEX(path[k].p_hdr)) { + int len = EXT_LAST_INDEX(path[k].p_hdr) - path[k].p_idx; + len *= sizeof(struct ext4_extent_idx); + memmove(path[k].p_idx, path[k].p_idx + 1, len); + } + + le16_add_cpu(&path[k].p_hdr->eh_entries, -1); + err = ext4_ext_dirty(handle, inode, path + k); + if (err) + return err; + ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf); + trace_ext4_ext_rm_idx(inode, leaf); + + ext4_free_blocks(handle, inode, NULL, leaf, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + + while (--k >= 0) { + if (path[k + 1].p_idx != EXT_FIRST_INDEX(path[k + 1].p_hdr)) + break; + err = ext4_ext_get_access(handle, inode, path + k); + if (err) + goto clean; + path[k].p_idx->ei_block = path[k + 1].p_idx->ei_block; + err = ext4_ext_dirty(handle, inode, path + k); + if (err) + goto clean; + } + return 0; + +clean: + /* + * The path[k].p_bh is either unmodified or with no verified bit + * set (see ext4_ext_get_access()). So just clear the verified bit + * of the successfully modified extents buffers, which will force + * these extents to be checked to avoid using inconsistent data. + */ + while (++k < depth) + clear_buffer_verified(path[k].p_bh); + + return err; +} + +/* + * ext4_ext_calc_credits_for_single_extent: + * This routine returns max. credits that needed to insert an extent + * to the extent tree. + * When pass the actual path, the caller should calculate credits + * under i_data_sem. + */ +int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, + struct ext4_ext_path *path) +{ + if (path) { + int depth = ext_depth(inode); + int ret = 0; + + /* probably there is space in leaf? */ + if (le16_to_cpu(path[depth].p_hdr->eh_entries) + < le16_to_cpu(path[depth].p_hdr->eh_max)) { + + /* + * There are some space in the leaf tree, no + * need to account for leaf block credit + * + * bitmaps and block group descriptor blocks + * and other metadata blocks still need to be + * accounted. + */ + /* 1 bitmap, 1 block group descriptor */ + ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); + return ret; + } + } + + return ext4_chunk_trans_blocks(inode, nrblocks); +} + +/* + * How many index/leaf blocks need to change/allocate to add @extents extents? + * + * If we add a single extent, then in the worse case, each tree level + * index/leaf need to be changed in case of the tree split. + * + * If more extents are inserted, they could cause the whole tree split more + * than once, but this is really rare. + */ +int ext4_ext_index_trans_blocks(struct inode *inode, int extents) +{ + int index; + + /* If we are converting the inline data, only one is needed here. */ + if (ext4_has_inline_data(inode)) + return 1; + + /* + * Extent tree can change between the time we estimate credits and + * the time we actually modify the tree. Assume the worst case. 
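+ *
+ * (Illustrative aside, not upstream text: assuming EXT4_MAX_EXTENT_DEPTH
+ * is 5, inserting a single extent is charged 5 * 2 + 1 = 11 blocks,
+ * while e.g. 1000 extents with 4 KiB blocks (340 extents per leaf)
+ * cost 5 * 3 + DIV_ROUND_UP(1000, 340) = 18 blocks, per the formula
+ * below.)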
+ */ + if (extents <= 1) + index = (EXT4_MAX_EXTENT_DEPTH * 2) + extents; + else + index = (EXT4_MAX_EXTENT_DEPTH * 3) + + DIV_ROUND_UP(extents, ext4_ext_space_block(inode, 0)); + + return index; +} + +static inline int get_default_free_blocks_flags(struct inode *inode) +{ + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) + return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; + else if (ext4_should_journal_data(inode)) + return EXT4_FREE_BLOCKS_FORGET; + return 0; +} + +/* + * ext4_rereserve_cluster - increment the reserved cluster count when + * freeing a cluster with a pending reservation + * + * @inode - file containing the cluster + * @lblk - logical block in cluster to be reserved + * + * Increments the reserved cluster count and adjusts quota in a bigalloc + * file system when freeing a partial cluster containing at least one + * delayed and unwritten block. A partial cluster meeting that + * requirement will have a pending reservation. If so, the + * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to + * defer reserved and allocated space accounting to a subsequent call + * to this function. + */ +static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + + dquot_reclaim_block(inode, EXT4_C2B(sbi, 1)); + + spin_lock(&ei->i_block_reservation_lock); + ei->i_reserved_data_blocks++; + percpu_counter_add(&sbi->s_dirtyclusters_counter, 1); + spin_unlock(&ei->i_block_reservation_lock); + + percpu_counter_add(&sbi->s_freeclusters_counter, 1); + ext4_remove_pending(inode, lblk); +} + +static int ext4_remove_blocks(handle_t *handle, struct inode *inode, + struct ext4_extent *ex, + struct partial_cluster *partial, + ext4_lblk_t from, ext4_lblk_t to) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned short ee_len = ext4_ext_get_actual_len(ex); + ext4_fsblk_t last_pblk, pblk; + ext4_lblk_t num; + int flags; + + /* only extent tail removal is allowed */ + if (from < le32_to_cpu(ex->ee_block) || + to != le32_to_cpu(ex->ee_block) + ee_len - 1) { + ext4_error(sbi->s_sb, + "strange request: removal(2) %u-%u from %u:%u", + from, to, le32_to_cpu(ex->ee_block), ee_len); + return 0; + } + +#ifdef EXTENTS_STATS + spin_lock(&sbi->s_ext_stats_lock); + sbi->s_ext_blocks += ee_len; + sbi->s_ext_extents++; + if (ee_len < sbi->s_ext_min) + sbi->s_ext_min = ee_len; + if (ee_len > sbi->s_ext_max) + sbi->s_ext_max = ee_len; + if (ext_depth(inode) > sbi->s_depth_max) + sbi->s_depth_max = ext_depth(inode); + spin_unlock(&sbi->s_ext_stats_lock); +#endif + + trace_ext4_remove_blocks(inode, ex, from, to, partial); + + /* + * if we have a partial cluster, and it's different from the + * cluster of the last block in the extent, we free it + */ + last_pblk = ext4_ext_pblock(ex) + ee_len - 1; + + if (partial->state != initial && + partial->pclu != EXT4_B2C(sbi, last_pblk)) { + if (partial->state == tofree) { + flags = get_default_free_blocks_flags(inode); + if (ext4_is_pending(inode, partial->lblk)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, partial->pclu), + sbi->s_cluster_ratio, flags); + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, partial->lblk); + } + partial->state = initial; + } + + num = le32_to_cpu(ex->ee_block) + ee_len - from; + pblk = ext4_ext_pblock(ex) + ee_len - num; + + /* + * We free the partial cluster 
at the end of the extent (if any), + * unless the cluster is used by another extent (partial_cluster + * state is nofree). If a partial cluster exists here, it must be + * shared with the last block in the extent. + */ + flags = get_default_free_blocks_flags(inode); + + /* partial, left end cluster aligned, right end unaligned */ + if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) && + (EXT4_LBLK_CMASK(sbi, to) >= from) && + (partial->state != nofree)) { + if (ext4_is_pending(inode, to)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; + ext4_free_blocks(handle, inode, NULL, + EXT4_PBLK_CMASK(sbi, last_pblk), + sbi->s_cluster_ratio, flags); + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, to); + partial->state = initial; + flags = get_default_free_blocks_flags(inode); + } + + flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; + + /* + * For bigalloc file systems, we never free a partial cluster + * at the beginning of the extent. Instead, we check to see if we + * need to free it on a subsequent call to ext4_remove_blocks, + * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. + */ + flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; + ext4_free_blocks(handle, inode, NULL, pblk, num, flags); + + /* reset the partial cluster if we've freed past it */ + if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk)) + partial->state = initial; + + /* + * If we've freed the entire extent but the beginning is not left + * cluster aligned and is not marked as ineligible for freeing we + * record the partial cluster at the beginning of the extent. It + * wasn't freed by the preceding ext4_free_blocks() call, and we + * need to look farther to the left to determine if it's to be freed + * (not shared with another extent). Else, reset the partial + * cluster - we're either done freeing or the beginning of the + * extent is left cluster aligned. + */ + if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) { + if (partial->state == initial) { + partial->pclu = EXT4_B2C(sbi, pblk); + partial->lblk = from; + partial->state = tofree; + } + } else { + partial->state = initial; + } + + return 0; +} + +/* + * ext4_ext_rm_leaf() Removes the extents associated with the + * blocks appearing between "start" and "end". Both "start" + * and "end" must appear in the same extent or EIO is returned. + * + * @handle: The journal handle + * @inode: The files inode + * @path: The path to the leaf + * @partial_cluster: The cluster which we'll have to free if all extents + * has been released from it. However, if this value is + * negative, it's a cluster just to the right of the + * punched region and it must not be freed. 
+ * @start: The first block to remove + * @end: The last block to remove + */ +static int +ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct partial_cluster *partial, + ext4_lblk_t start, ext4_lblk_t end) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int err = 0, correct_index = 0; + int depth = ext_depth(inode), credits, revoke_credits; + struct ext4_extent_header *eh; + ext4_lblk_t a, b; + unsigned num; + ext4_lblk_t ex_ee_block; + unsigned short ex_ee_len; + unsigned unwritten = 0; + struct ext4_extent *ex; + ext4_fsblk_t pblk; + + /* the header must be checked already in ext4_ext_remove_space() */ + ext_debug(inode, "truncate since %u in leaf to %u\n", start, end); + if (!path[depth].p_hdr) + path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); + eh = path[depth].p_hdr; + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + return -EFSCORRUPTED; + } + /* find where to start removing */ + ex = path[depth].p_ext; + if (!ex) + ex = EXT_LAST_EXTENT(eh); + + ex_ee_block = le32_to_cpu(ex->ee_block); + ex_ee_len = ext4_ext_get_actual_len(ex); + + trace_ext4_ext_rm_leaf(inode, start, ex, partial); + + while (ex >= EXT_FIRST_EXTENT(eh) && + ex_ee_block + ex_ee_len > start) { + + if (ext4_ext_is_unwritten(ex)) + unwritten = 1; + else + unwritten = 0; + + ext_debug(inode, "remove ext %u:[%d]%d\n", ex_ee_block, + unwritten, ex_ee_len); + path[depth].p_ext = ex; + + a = max(ex_ee_block, start); + b = min(ex_ee_block + ex_ee_len - 1, end); + + ext_debug(inode, " border %u:%u\n", a, b); + + /* If this extent is beyond the end of the hole, skip it */ + if (end < ex_ee_block) { + /* + * We're going to skip this extent and move to another, + * so note that its first cluster is in use to avoid + * freeing it when removing blocks. Eventually, the + * right edge of the truncated/punched region will + * be just to the left. + */ + if (sbi->s_cluster_ratio > 1) { + pblk = ext4_ext_pblock(ex); + partial->pclu = EXT4_B2C(sbi, pblk); + partial->state = nofree; + } + ex--; + ex_ee_block = le32_to_cpu(ex->ee_block); + ex_ee_len = ext4_ext_get_actual_len(ex); + continue; + } else if (b != ex_ee_block + ex_ee_len - 1) { + EXT4_ERROR_INODE(inode, + "can not handle truncate %u:%u " + "on extent %u:%u", + start, end, ex_ee_block, + ex_ee_block + ex_ee_len - 1); + err = -EFSCORRUPTED; + goto out; + } else if (a != ex_ee_block) { + /* remove tail of the extent */ + num = a - ex_ee_block; + } else { + /* remove whole extent: excellent! */ + num = 0; + } + /* + * 3 for leaf, sb, and inode plus 2 (bmap and group + * descriptor) for each block group; assume two block + * groups plus ex_ee_len/blocks_per_block_group for + * the worst case + */ + credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); + if (ex == EXT_FIRST_EXTENT(eh)) { + correct_index = 1; + credits += (ext_depth(inode)) + 1; + } + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); + /* + * We may end up freeing some index blocks and data from the + * punched range. Note that partial clusters are accounted for + * by ext4_free_data_revoke_credits(). 
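+ *
+ * (Illustrative aside, not upstream text: for a 1000-block extent with
+ * 32768 blocks per group, the estimate above works out to
+ * 7 + 2 * (1000 / 32768) = 7 credits, plus ext_depth() + 1 more if the
+ * first extent in the leaf forces an index correction, plus the quota
+ * blocks.)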
+ */ + revoke_credits = + ext4_free_metadata_revoke_credits(inode->i_sb, + ext_depth(inode)) + + ext4_free_data_revoke_credits(inode, b - a + 1); + + err = ext4_datasem_ensure_credits(handle, inode, credits, + credits, revoke_credits); + if (err) { + if (err > 0) + err = -EAGAIN; + goto out; + } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + err = ext4_remove_blocks(handle, inode, ex, partial, a, b); + if (err) + goto out; + + if (num == 0) + /* this extent is removed; mark slot entirely unused */ + ext4_ext_store_pblock(ex, 0); + + ex->ee_len = cpu_to_le16(num); + /* + * Do not mark unwritten if all the blocks in the + * extent have been removed. + */ + if (unwritten && num) + ext4_ext_mark_unwritten(ex); + /* + * If the extent was completely released, + * we need to remove it from the leaf + */ + if (num == 0) { + if (end != EXT_MAX_BLOCKS - 1) { + /* + * For hole punching, we need to scoot all the + * extents up when an extent is removed so that + * we dont have blank extents in the middle + */ + memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * + sizeof(struct ext4_extent)); + + /* Now get rid of the one at the end */ + memset(EXT_LAST_EXTENT(eh), 0, + sizeof(struct ext4_extent)); + } + le16_add_cpu(&eh->eh_entries, -1); + } + + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + ext_debug(inode, "new extent: %u:%u:%llu\n", ex_ee_block, num, + ext4_ext_pblock(ex)); + ex--; + ex_ee_block = le32_to_cpu(ex->ee_block); + ex_ee_len = ext4_ext_get_actual_len(ex); + } + + if (correct_index && eh->eh_entries) + err = ext4_ext_correct_indexes(handle, inode, path); + + /* + * If there's a partial cluster and at least one extent remains in + * the leaf, free the partial cluster if it isn't shared with the + * current extent. If it is shared with the current extent + * we reset the partial cluster because we've reached the start of the + * truncated/punched region and we're done removing blocks. 
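+ *
+ * (Illustrative aside, not upstream text: @partial moves between three
+ * states while the leaf is scanned right to left:
+ *
+ *	initial - no candidate cluster is being tracked
+ *	tofree  - a left-edge cluster may still need freeing
+ *	nofree  - the cluster is shared and must be kept
+ *
+ * and the check below frees a tofree cluster only when it is not
+ * shared with the extent we stopped on.)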
+ */ + if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) { + pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; + if (partial->pclu != EXT4_B2C(sbi, pblk)) { + int flags = get_default_free_blocks_flags(inode); + + if (ext4_is_pending(inode, partial->lblk)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, partial->pclu), + sbi->s_cluster_ratio, flags); + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, partial->lblk); + } + partial->state = initial; + } + + /* if this leaf is free, then we should + * remove it from index block above */ + if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) + err = ext4_ext_rm_idx(handle, inode, path, depth); + +out: + return err; +} + +/* + * ext4_ext_more_to_rm: + * returns 1 if current index has to be freed (even partial) + */ +static int +ext4_ext_more_to_rm(struct ext4_ext_path *path) +{ + BUG_ON(path->p_idx == NULL); + + if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) + return 0; + + /* + * if truncate on deeper level happened, it wasn't partial, + * so we have to consider current index for truncation + */ + if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) + return 0; + return 1; +} + +int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int depth = ext_depth(inode); + struct ext4_ext_path *path = NULL; + struct partial_cluster partial; + handle_t *handle; + int i = 0, err = 0; + int flags = EXT4_EX_NOCACHE | EXT4_EX_NOFAIL; + + partial.pclu = 0; + partial.lblk = 0; + partial.state = initial; + + ext_debug(inode, "truncate since %u to %u\n", start, end); + + /* probably first extent we're gonna free will be last in block */ + handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE, + depth + 1, + ext4_free_metadata_revoke_credits(inode->i_sb, depth)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + +again: + trace_ext4_ext_remove_space(inode, start, end, depth); + + /* + * Check if we are removing extents inside the extent tree. If that + * is the case, we are going to punch a hole inside the extent tree + * so we have to check whether we need to split the extent covering + * the last block to remove so we can easily remove the part of it + * in ext4_ext_rm_leaf(). + */ + if (end < EXT_MAX_BLOCKS - 1) { + struct ext4_extent *ex; + ext4_lblk_t ee_block, ex_end, lblk; + ext4_fsblk_t pblk; + + /* find extent for or closest extent to this block */ + path = ext4_find_extent(inode, end, NULL, flags); + if (IS_ERR(path)) { + ext4_journal_stop(handle); + return PTR_ERR(path); + } + depth = ext_depth(inode); + /* Leaf not may not exist only if inode has no blocks at all */ + ex = path[depth].p_ext; + if (!ex) { + if (depth) { + EXT4_ERROR_INODE(inode, + "path[%d].p_hdr == NULL", + depth); + err = -EFSCORRUPTED; + } + goto out; + } + + ee_block = le32_to_cpu(ex->ee_block); + ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1; + + /* + * See if the last block is inside the extent, if so split + * the extent at 'end' block so we can easily remove the + * tail of the first part of the split extent in + * ext4_ext_rm_leaf(). + */ + if (end >= ee_block && end < ex_end) { + + /* + * If we're going to split the extent, note that + * the cluster containing the block after 'end' is + * in use to avoid freeing it when removing blocks. 
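+ *
+ * (Illustrative aside, not upstream text: punching out blocks 50-150
+ * when an extent covers 100-199 splits it at 151, so 100-150 can be
+ * removed as a whole while 151-199 survives; with bigalloc, the
+ * cluster holding block 151 is first marked nofree.)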
+ */ + if (sbi->s_cluster_ratio > 1) { + pblk = ext4_ext_pblock(ex) + end - ee_block + 1; + partial.pclu = EXT4_B2C(sbi, pblk); + partial.state = nofree; + } + + /* + * Split the extent in two so that 'end' is the last + * block in the first new extent. Also we should not + * fail removing space due to ENOSPC so try to use + * reserved block if that happens. + */ + path = ext4_force_split_extent_at(handle, inode, path, + end + 1, 1); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + } else if (sbi->s_cluster_ratio > 1 && end >= ex_end && + partial.state == initial) { + /* + * If we're punching, there's an extent to the right. + * If the partial cluster hasn't been set, set it to + * that extent's first cluster and its state to nofree + * so it won't be freed should it contain blocks to be + * removed. If it's already set (tofree/nofree), we're + * retrying and keep the original partial cluster info + * so a cluster marked tofree as a result of earlier + * extent removal is not lost. + */ + lblk = ex_end + 1; + err = ext4_ext_search_right(inode, path, &lblk, &pblk, + NULL, flags); + if (err < 0) + goto out; + if (pblk) { + partial.pclu = EXT4_B2C(sbi, pblk); + partial.state = nofree; + } + } + } + /* + * We start scanning from right side, freeing all the blocks + * after i_size and walking into the tree depth-wise. + */ + depth = ext_depth(inode); + if (path) { + int k = i = depth; + while (--k > 0) + path[k].p_block = + le16_to_cpu(path[k].p_hdr->eh_entries)+1; + } else { + path = kcalloc(depth + 1, sizeof(struct ext4_ext_path), + GFP_NOFS | __GFP_NOFAIL); + if (path == NULL) { + ext4_journal_stop(handle); + return -ENOMEM; + } + path[0].p_maxdepth = path[0].p_depth = depth; + path[0].p_hdr = ext_inode_hdr(inode); + i = 0; + + if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) { + err = -EFSCORRUPTED; + goto out; + } + } + err = 0; + + while (i >= 0 && err == 0) { + if (i == depth) { + /* this is leaf block */ + err = ext4_ext_rm_leaf(handle, inode, path, + &partial, start, end); + /* root level has p_bh == NULL, brelse() eats this */ + ext4_ext_path_brelse(path + i); + i--; + continue; + } + + /* this is index block */ + if (!path[i].p_hdr) { + ext_debug(inode, "initialize header\n"); + path[i].p_hdr = ext_block_hdr(path[i].p_bh); + } + + if (!path[i].p_idx) { + /* this level hasn't been touched yet */ + path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); + path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; + ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n", + path[i].p_hdr, + le16_to_cpu(path[i].p_hdr->eh_entries)); + } else { + /* we were already here, see at next index */ + path[i].p_idx--; + } + + ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n", + i, EXT_FIRST_INDEX(path[i].p_hdr), + path[i].p_idx); + if (ext4_ext_more_to_rm(path + i)) { + struct buffer_head *bh; + /* go to the next level */ + ext_debug(inode, "move to level %d (block %llu)\n", + i + 1, ext4_idx_pblock(path[i].p_idx)); + memset(path + i + 1, 0, sizeof(*path)); + bh = read_extent_tree_block(inode, path[i].p_idx, + depth - i - 1, flags); + if (IS_ERR(bh)) { + /* should we reset i_size? */ + err = PTR_ERR(bh); + break; + } + /* Yield here to deal with large extent trees. + * Should be a no-op if we did IO above. 
*/ + cond_resched(); + if (WARN_ON(i + 1 > depth)) { + err = -EFSCORRUPTED; + break; + } + path[i + 1].p_bh = bh; + + /* save actual number of indexes since this + * number is changed at the next iteration */ + path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); + i++; + } else { + /* we finished processing this index, go up */ + if (path[i].p_hdr->eh_entries == 0 && i > 0) { + /* index is empty, remove it; + * handle must be already prepared by the + * truncatei_leaf() */ + err = ext4_ext_rm_idx(handle, inode, path, i); + } + /* root level has p_bh == NULL, brelse() eats this */ + ext4_ext_path_brelse(path + i); + i--; + ext_debug(inode, "return to level %d\n", i); + } + } + + trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial, + path->p_hdr->eh_entries); + + /* + * if there's a partial cluster and we have removed the first extent + * in the file, then we also free the partial cluster, if any + */ + if (partial.state == tofree && err == 0) { + int flags = get_default_free_blocks_flags(inode); + + if (ext4_is_pending(inode, partial.lblk)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, partial.pclu), + sbi->s_cluster_ratio, flags); + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, partial.lblk); + partial.state = initial; + } + + /* TODO: flexible tree reduction should be here */ + if (path->p_hdr->eh_entries == 0) { + /* + * truncate to zero freed all the tree, + * so we need to correct eh_depth + */ + err = ext4_ext_get_access(handle, inode, path); + if (err == 0) { + ext_inode_hdr(inode)->eh_depth = 0; + ext_inode_hdr(inode)->eh_max = + cpu_to_le16(ext4_ext_space_root(inode, 0)); + err = ext4_ext_dirty(handle, inode, path); + } + } +out: + ext4_free_ext_path(path); + path = NULL; + if (err == -EAGAIN) + goto again; + ext4_journal_stop(handle); + + return err; +} + +/* + * called at mount time + */ +void ext4_ext_init(struct super_block *sb) +{ + /* + * possible initialization would be here + */ + + if (ext4_has_feature_extents(sb)) { +#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) + printk(KERN_INFO "EXT4-fs: file extents enabled" +#ifdef AGGRESSIVE_TEST + ", aggressive tests" +#endif +#ifdef CHECK_BINSEARCH + ", check binsearch" +#endif +#ifdef EXTENTS_STATS + ", stats" +#endif + "\n"); +#endif +#ifdef EXTENTS_STATS + spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); + EXT4_SB(sb)->s_ext_min = 1 << 30; + EXT4_SB(sb)->s_ext_max = 0; +#endif + } +} + +/* + * called at umount time + */ +void ext4_ext_release(struct super_block *sb) +{ + if (!ext4_has_feature_extents(sb)) + return; + +#ifdef EXTENTS_STATS + if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", + sbi->s_ext_blocks, sbi->s_ext_extents, + sbi->s_ext_blocks / sbi->s_ext_extents); + printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", + sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); + } +#endif +} + +static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) +{ + ext4_lblk_t ee_block; + ext4_fsblk_t ee_pblock; + unsigned int ee_len; + + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext4_ext_pblock(ex); + + if (ee_len == 0) + return; + + ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, + EXTENT_STATUS_WRITTEN, false); +} + +/* FIXME!! 
we need to try to merge to left or right after zero-out */ +static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) +{ + ext4_fsblk_t ee_pblock; + unsigned int ee_len; + + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext4_ext_pblock(ex); + return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock, + ee_len); +} + +/* + * ext4_split_extent_at() splits an extent at given block. + * + * @handle: the journal handle + * @inode: the file inode + * @path: the path to the extent + * @split: the logical block where the extent is splitted. + * @split_flags: indicates if the extent could be zeroout if split fails, and + * the states(init or unwritten) of new extents. + * @flags: flags used to insert new extent to extent tree. + * + * + * Splits extent [a, b] into two extents [a, @split) and [@split, b], states + * of which are determined by split_flag. + * + * There are two cases: + * a> the extent are splitted into two extent. + * b> split is not needed, and just mark the extent. + * + * Return an extent path pointer on success, or an error pointer on failure. + */ +static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t split, + int split_flag, int flags) +{ + ext4_fsblk_t newblock; + ext4_lblk_t ee_block; + struct ext4_extent *ex, newex, orig_ex, zero_ex; + struct ext4_extent *ex2 = NULL; + unsigned int ee_len, depth; + int err = 0; + + BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) == + (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)); + + ext_debug(inode, "logical block %llu\n", (unsigned long long)split); + + ext4_ext_show_leaf(inode, path); + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + newblock = split - ee_block + ext4_ext_pblock(ex); + + BUG_ON(split < ee_block || split >= (ee_block + ee_len)); + BUG_ON(!ext4_ext_is_unwritten(ex) && + split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2)); + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + if (split == ee_block) { + /* + * case b: block @split is the block that the extent begins with + * then we just change the state of the extent, and splitting + * is not needed. + */ + if (split_flag & EXT4_EXT_MARK_UNWRIT2) + ext4_ext_mark_unwritten(ex); + else + ext4_ext_mark_initialized(ex); + + if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) + ext4_ext_try_to_merge(handle, inode, path, ex); + + err = ext4_ext_dirty(handle, inode, path + path->p_depth); + goto out; + } + + /* case a */ + memcpy(&orig_ex, ex, sizeof(orig_ex)); + ex->ee_len = cpu_to_le16(split - ee_block); + if (split_flag & EXT4_EXT_MARK_UNWRIT1) + ext4_ext_mark_unwritten(ex); + + /* + * path may lead to new leaf, not to original leaf any more + * after ext4_ext_insert_extent() returns, + */ + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto fix_extent_len; + + ex2 = &newex; + ex2->ee_block = cpu_to_le32(split); + ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); + ext4_ext_store_pblock(ex2, newblock); + if (split_flag & EXT4_EXT_MARK_UNWRIT2) + ext4_ext_mark_unwritten(ex2); + + path = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (!IS_ERR(path)) + goto out; + + err = PTR_ERR(path); + if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) + return path; + + /* + * Get a new path to try to zeroout or fix the extent length. 
+ * Using EXT4_EX_NOFAIL guarantees that ext4_find_extent() + * will not return -ENOMEM, otherwise -ENOMEM will cause a + * retry in do_writepages(), and a WARN_ON may be triggered + * in ext4_da_update_reserve_space() due to an incorrect + * ee_len causing the i_reserved_data_blocks exception. + */ + path = ext4_find_extent(inode, ee_block, NULL, flags | EXT4_EX_NOFAIL); + if (IS_ERR(path)) { + EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld", + split, PTR_ERR(path)); + return path; + } + depth = ext_depth(inode); + ex = path[depth].p_ext; + + if (EXT4_EXT_MAY_ZEROOUT & split_flag) { + if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { + if (split_flag & EXT4_EXT_DATA_VALID1) { + err = ext4_ext_zeroout(inode, ex2); + zero_ex.ee_block = ex2->ee_block; + zero_ex.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(ex2)); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex2)); + } else { + err = ext4_ext_zeroout(inode, ex); + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(ex)); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex)); + } + } else { + err = ext4_ext_zeroout(inode, &orig_ex); + zero_ex.ee_block = orig_ex.ee_block; + zero_ex.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(&orig_ex)); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(&orig_ex)); + } + + if (!err) { + /* update the extent length and mark as initialized */ + ex->ee_len = cpu_to_le16(ee_len); + ext4_ext_try_to_merge(handle, inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (!err) + /* update extent status tree */ + ext4_zeroout_es(inode, &zero_ex); + /* If we failed at this point, we don't know in which + * state the extent tree exactly is so don't try to fix + * length of the original extent as it may do even more + * damage. + */ + goto out; + } + } + +fix_extent_len: + ex->ee_len = orig_ex.ee_len; + /* + * Ignore ext4_ext_dirty return value since we are already in error path + * and err is a non-zero error code. 
+ */ + ext4_ext_dirty(handle, inode, path + path->p_depth); +out: + if (err) { + ext4_free_ext_path(path); + path = ERR_PTR(err); + } + ext4_ext_show_leaf(inode, path); + return path; +} + +/* + * ext4_split_extent() splits an extent and mark extent which is covered + * by @map as split_flags indicates + * + * It may result in splitting the extent into multiple extents (up to three) + * There are three possibilities: + * a> There is no split required + * b> Splits in two extents: Split is happening at either end of the extent + * c> Splits in three extents: Somone is splitting in middle of the extent + * + */ +static struct ext4_ext_path *ext4_split_extent(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + struct ext4_map_blocks *map, + int split_flag, int flags, + unsigned int *allocated) +{ + ext4_lblk_t ee_block; + struct ext4_extent *ex; + unsigned int ee_len, depth; + int unwritten; + int split_flag1, flags1; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + unwritten = ext4_ext_is_unwritten(ex); + + if (map->m_lblk + map->m_len < ee_block + ee_len) { + split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; + flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; + if (unwritten) + split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2; + if (split_flag & EXT4_EXT_DATA_VALID2) + split_flag1 |= EXT4_EXT_DATA_VALID1; + path = ext4_split_extent_at(handle, inode, path, + map->m_lblk + map->m_len, split_flag1, flags1); + if (IS_ERR(path)) + return path; + /* + * Update path is required because previous ext4_split_extent_at + * may result in split of original leaf or extent zeroout. + */ + path = ext4_find_extent(inode, map->m_lblk, path, flags); + if (IS_ERR(path)) + return path; + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + ext4_free_ext_path(path); + return ERR_PTR(-EFSCORRUPTED); + } + unwritten = ext4_ext_is_unwritten(ex); + } + + if (map->m_lblk >= ee_block) { + split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; + if (unwritten) { + split_flag1 |= EXT4_EXT_MARK_UNWRIT1; + split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNWRIT2); + } + path = ext4_split_extent_at(handle, inode, path, + map->m_lblk, split_flag1, flags); + if (IS_ERR(path)) + return path; + } + + if (allocated) { + if (map->m_lblk + map->m_len > ee_block + ee_len) + *allocated = ee_len - (map->m_lblk - ee_block); + else + *allocated = map->m_len; + } + ext4_ext_show_leaf(inode, path); + return path; +} + +/* + * This function is called by ext4_ext_map_blocks() if someone tries to write + * to an unwritten extent. It may result in splitting the unwritten + * extent into multiple extents (up to three - one initialized and two + * unwritten). + * There are three possibilities: + * a> There is no split required: Entire extent should be initialized + * b> Splits in two extents: Write is happening at either end of the extent + * c> Splits in three extents: Somone is writing in middle of the extent + * + * Pre-conditions: + * - The extent pointed to by 'path' is unwritten. + * - The extent pointed to by 'path' contains a superset + * of the logical span [map->m_lblk, map->m_lblk + map->m_len). + * + * Post-conditions on success: + * - the returned value is the number of blocks beyond map->l_lblk + * that are allocated and initialized. + * It is guaranteed to be >= map->m_len. 
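+ *
+ * (Illustrative aside, not upstream text: for case c, writing blocks
+ * 40-59 of an unwritten extent covering 0-99 leaves 0-39 unwritten,
+ * 40-59 initialized and 60-99 unwritten, and *allocated reports the
+ * 20 initialized blocks past map->m_lblk.)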
+ */ +static struct ext4_ext_path * +ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, struct ext4_ext_path *path, + int flags, unsigned int *allocated) +{ + struct ext4_sb_info *sbi; + struct ext4_extent_header *eh; + struct ext4_map_blocks split_map; + struct ext4_extent zero_ex1, zero_ex2; + struct ext4_extent *ex, *abut_ex; + ext4_lblk_t ee_block, eof_block; + unsigned int ee_len, depth, map_len = map->m_len; + int err = 0; + int split_flag = EXT4_EXT_DATA_VALID2; + unsigned int max_zeroout = 0; + + ext_debug(inode, "logical block %llu, max_blocks %u\n", + (unsigned long long)map->m_lblk, map_len); + + sbi = EXT4_SB(inode->i_sb); + eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1) + >> inode->i_sb->s_blocksize_bits; + if (eof_block < map->m_lblk + map_len) + eof_block = map->m_lblk + map_len; + + depth = ext_depth(inode); + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + zero_ex1.ee_len = 0; + zero_ex2.ee_len = 0; + + trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); + + /* Pre-conditions */ + BUG_ON(!ext4_ext_is_unwritten(ex)); + BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); + + /* + * Attempt to transfer newly initialized blocks from the currently + * unwritten extent to its neighbor. This is much cheaper + * than an insertion followed by a merge as those involve costly + * memmove() calls. Transferring to the left is the common case in + * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) + * followed by append writes. + * + * Limitations of the current logic: + * - L1: we do not deal with writes covering the whole extent. + * This would require removing the extent if the transfer + * is possible. + * - L2: we only attempt to merge with an extent stored in the + * same extent tree node. + */ + *allocated = 0; + if ((map->m_lblk == ee_block) && + /* See if we can merge left */ + (map_len < ee_len) && /*L1*/ + (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/ + ext4_lblk_t prev_lblk; + ext4_fsblk_t prev_pblk, ee_pblk; + unsigned int prev_len; + + abut_ex = ex - 1; + prev_lblk = le32_to_cpu(abut_ex->ee_block); + prev_len = ext4_ext_get_actual_len(abut_ex); + prev_pblk = ext4_ext_pblock(abut_ex); + ee_pblk = ext4_ext_pblock(ex); + + /* + * A transfer of blocks from 'ex' to 'abut_ex' is allowed + * upon those conditions: + * - C1: abut_ex is initialized, + * - C2: abut_ex is logically abutting ex, + * - C3: abut_ex is physically abutting ex, + * - C4: abut_ex can receive the additional blocks without + * overflowing the (initialized) length limit. 
+ */ + if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ + ((prev_lblk + prev_len) == ee_block) && /*C2*/ + ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ + (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto errout; + + trace_ext4_ext_convert_to_initialized_fastpath(inode, + map, ex, abut_ex); + + /* Shift the start of ex by 'map_len' blocks */ + ex->ee_block = cpu_to_le32(ee_block + map_len); + ext4_ext_store_pblock(ex, ee_pblk + map_len); + ex->ee_len = cpu_to_le16(ee_len - map_len); + ext4_ext_mark_unwritten(ex); /* Restore the flag */ + + /* Extend abut_ex by 'map_len' blocks */ + abut_ex->ee_len = cpu_to_le16(prev_len + map_len); + + /* Result: number of initialized blocks past m_lblk */ + *allocated = map_len; + } + } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && + (map_len < ee_len) && /*L1*/ + ex < EXT_LAST_EXTENT(eh)) { /*L2*/ + /* See if we can merge right */ + ext4_lblk_t next_lblk; + ext4_fsblk_t next_pblk, ee_pblk; + unsigned int next_len; + + abut_ex = ex + 1; + next_lblk = le32_to_cpu(abut_ex->ee_block); + next_len = ext4_ext_get_actual_len(abut_ex); + next_pblk = ext4_ext_pblock(abut_ex); + ee_pblk = ext4_ext_pblock(ex); + + /* + * A transfer of blocks from 'ex' to 'abut_ex' is allowed + * upon those conditions: + * - C1: abut_ex is initialized, + * - C2: abut_ex is logically abutting ex, + * - C3: abut_ex is physically abutting ex, + * - C4: abut_ex can receive the additional blocks without + * overflowing the (initialized) length limit. + */ + if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ + ((map->m_lblk + map_len) == next_lblk) && /*C2*/ + ((ee_pblk + ee_len) == next_pblk) && /*C3*/ + (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto errout; + + trace_ext4_ext_convert_to_initialized_fastpath(inode, + map, ex, abut_ex); + + /* Shift the start of abut_ex by 'map_len' blocks */ + abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); + ext4_ext_store_pblock(abut_ex, next_pblk - map_len); + ex->ee_len = cpu_to_le16(ee_len - map_len); + ext4_ext_mark_unwritten(ex); /* Restore the flag */ + + /* Extend abut_ex by 'map_len' blocks */ + abut_ex->ee_len = cpu_to_le16(next_len + map_len); + + /* Result: number of initialized blocks past m_lblk */ + *allocated = map_len; + } + } + if (*allocated) { + /* Mark the block containing both extents as dirty */ + err = ext4_ext_dirty(handle, inode, path + depth); + + /* Update path to point to the right extent */ + path[depth].p_ext = abut_ex; + if (err) + goto errout; + goto out; + } else + *allocated = ee_len - (map->m_lblk - ee_block); + + WARN_ON(map->m_lblk < ee_block); + /* + * It is safe to convert extent to initialized via explicit + * zeroout only if extent is fully inside i_size or new_size. + */ + split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; + + if (EXT4_EXT_MAY_ZEROOUT & split_flag) + max_zeroout = sbi->s_extent_max_zeroout_kb >> + (inode->i_sb->s_blocksize_bits - 10); + + /* + * five cases: + * 1. split the extent into three extents. + * 2. split the extent into two extents, zeroout the head of the first + * extent. + * 3. split the extent into two extents, zeroout the tail of the second + * extent. + * 4. split the extent into two extents with out zeroout. + * 5. no splitting needed, just possibly zeroout the head and / or the + * tail of the extent. 
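+	 *
+	 * For example (case 5, illustrative, assuming max_zeroout allows
+	 * it): if the write covers the head [0, 8) of an unwritten extent
+	 * [0, 32), the tail [8, 32) can be zeroed out and the whole extent
+	 * marked initialized, avoiding a split entirely.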
+ */
+	split_map.m_lblk = map->m_lblk;
+	split_map.m_len = map->m_len;
+
+	if (max_zeroout && (*allocated > split_map.m_len)) {
+		if (*allocated <= max_zeroout) {
+			/* case 3 or 5 */
+			zero_ex1.ee_block =
+				 cpu_to_le32(split_map.m_lblk +
+					     split_map.m_len);
+			zero_ex1.ee_len =
+				cpu_to_le16(*allocated - split_map.m_len);
+			ext4_ext_store_pblock(&zero_ex1,
+				ext4_ext_pblock(ex) + split_map.m_lblk +
+				split_map.m_len - ee_block);
+			err = ext4_ext_zeroout(inode, &zero_ex1);
+			if (err)
+				goto fallback;
+			split_map.m_len = *allocated;
+		}
+		if (split_map.m_lblk - ee_block + split_map.m_len <
+								max_zeroout) {
+			/* case 2 or 5 */
+			if (split_map.m_lblk != ee_block) {
+				zero_ex2.ee_block = ex->ee_block;
+				zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk -
+							      ee_block);
+				ext4_ext_store_pblock(&zero_ex2,
+						      ext4_ext_pblock(ex));
+				err = ext4_ext_zeroout(inode, &zero_ex2);
+				if (err)
+					goto fallback;
+			}
+
+			split_map.m_len += split_map.m_lblk - ee_block;
+			split_map.m_lblk = ee_block;
+			*allocated = map->m_len;
+		}
+	}
+
+fallback:
+	path = ext4_split_extent(handle, inode, path, &split_map, split_flag,
+				 flags, NULL);
+	if (IS_ERR(path))
+		return path;
+out:
+	/* If we have gotten a failure, don't zero out status tree */
+	ext4_zeroout_es(inode, &zero_ex1);
+	ext4_zeroout_es(inode, &zero_ex2);
+	return path;
+
+errout:
+	ext4_free_ext_path(path);
+	return ERR_PTR(err);
+}
+
+/*
+ * This function is called by ext4_ext_map_blocks() from
+ * ext4_get_blocks_dio_write() when DIO is used to write
+ * to an unwritten extent.
+ *
+ * Writing to an unwritten extent may result in splitting the unwritten
+ * extent into multiple initialized/unwritten extents (up to three)
+ * There are three possibilities:
+ *   a> There is no split required: Entire extent should be unwritten
+ *   b> Splits in two extents: Write is happening at either end of the extent
+ *   c> Splits in three extents: Someone is writing in the middle of the extent
+ *
+ * This works the same way in the case of initialized -> unwritten conversion.
+ *
+ * One or more index blocks may be needed if the extent tree grows after
+ * the unwritten extent split. To prevent ENOSPC from occurring when the
+ * IO completes, we need to split the unwritten extent before DIO submits
+ * the IO. The unwritten extent to be written at this time will be split
+ * into three unwritten extents (at most). After the IO completes, the part
+ * being filled will be converted to initialized by the end_io callback
+ * function via ext4_convert_unwritten_extents().
+ *
+ * The size of the unwritten extent to be written is passed to the caller via
+ * the @allocated pointer. Return an extent path pointer on success, or an
+ * error pointer on failure.
+ */
+static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle,
+					struct inode *inode,
+					struct ext4_map_blocks *map,
+					struct ext4_ext_path *path,
+					int flags, unsigned int *allocated)
+{
+	ext4_lblk_t eof_block;
+	ext4_lblk_t ee_block;
+	struct ext4_extent *ex;
+	unsigned int ee_len;
+	int split_flag = 0, depth;
+
+	ext_debug(inode, "logical block %llu, max_blocks %u\n",
+		  (unsigned long long)map->m_lblk, map->m_len);
+
+	eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
+			>> inode->i_sb->s_blocksize_bits;
+	if (eof_block < map->m_lblk + map->m_len)
+		eof_block = map->m_lblk + map->m_len;
+	/*
+	 * It is safe to convert extent to initialized via explicit
+	 * zeroout only if extent is fully inside i_size or new_size.
+ */ + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + /* Convert to unwritten */ + if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { + split_flag |= EXT4_EXT_DATA_VALID1; + /* Convert to initialized */ + } else if (flags & EXT4_GET_BLOCKS_CONVERT) { + split_flag |= ee_block + ee_len <= eof_block ? + EXT4_EXT_MAY_ZEROOUT : 0; + split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); + } + flags |= EXT4_GET_BLOCKS_PRE_IO; + return ext4_split_extent(handle, inode, path, map, split_flag, flags, + allocated); +} + +static struct ext4_ext_path * +ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path) +{ + struct ext4_extent *ex; + ext4_lblk_t ee_block; + unsigned int ee_len; + int depth; + int err = 0; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + ext_debug(inode, "logical block %llu, max_blocks %u\n", + (unsigned long long)ee_block, ee_len); + + /* If extent is larger than requested it is a clear sign that we still + * have some extent state machine issues left. So extent_split is still + * required. + * TODO: Once all related issues will be fixed this situation should be + * illegal. + */ + if (ee_block != map->m_lblk || ee_len > map->m_len) { +#ifdef CONFIG_EXT4_DEBUG + ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu," + " len %u; IO logical block %llu, len %u", + inode->i_ino, (unsigned long long)ee_block, ee_len, + (unsigned long long)map->m_lblk, map->m_len); +#endif + path = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT, NULL); + if (IS_ERR(path)) + return path; + + path = ext4_find_extent(inode, map->m_lblk, path, 0); + if (IS_ERR(path)) + return path; + depth = ext_depth(inode); + ex = path[depth].p_ext; + } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto errout; + /* first mark the extent as initialized */ + ext4_ext_mark_initialized(ex); + + /* note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed + */ + ext4_ext_try_to_merge(handle, inode, path, ex); + + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + goto errout; + + ext4_ext_show_leaf(inode, path); + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); +} + +static struct ext4_ext_path * +convert_initialized_extent(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path, + unsigned int *allocated) +{ + struct ext4_extent *ex; + ext4_lblk_t ee_block; + unsigned int ee_len; + int depth; + int err = 0; + + /* + * Make sure that the extent is no bigger than we support with + * unwritten extent + */ + if (map->m_len > EXT_UNWRITTEN_MAX_LEN) + map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + ext_debug(inode, "logical block %llu, max_blocks %u\n", + (unsigned long long)ee_block, ee_len); + + if (ee_block != map->m_lblk || ee_len > map->m_len) { + path = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, NULL); + if (IS_ERR(path)) + return path; + + path = ext4_find_extent(inode, map->m_lblk, path, 0); + if (IS_ERR(path)) + return path; + depth = 
ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + err = -EFSCORRUPTED; + goto errout; + } + } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto errout; + /* first mark the extent as unwritten */ + ext4_ext_mark_unwritten(ex); + + /* note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed + */ + ext4_ext_try_to_merge(handle, inode, path, ex); + + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + goto errout; + ext4_ext_show_leaf(inode, path); + + ext4_update_inode_fsync_trans(handle, inode, 1); + + map->m_flags |= EXT4_MAP_UNWRITTEN; + if (*allocated > map->m_len) + *allocated = map->m_len; + map->m_len = *allocated; + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); +} + +static struct ext4_ext_path * +ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path, int flags, + unsigned int *allocated, ext4_fsblk_t newblock) +{ + int err = 0; + + ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n", + (unsigned long long)map->m_lblk, map->m_len, flags, + *allocated); + ext4_ext_show_leaf(inode, path); + + /* + * When writing into unwritten space, we should not fail to + * allocate metadata blocks for the new extent block if needed. + */ + flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; + + trace_ext4_ext_handle_unwritten_extents(inode, map, flags, + *allocated, newblock); + + /* get_block() before submitting IO, split the extent */ + if (flags & EXT4_GET_BLOCKS_PRE_IO) { + path = ext4_split_convert_extents(handle, inode, map, path, + flags | EXT4_GET_BLOCKS_CONVERT, allocated); + if (IS_ERR(path)) + return path; + /* + * shouldn't get a 0 allocated when splitting an extent unless + * m_len is 0 (bug) or extent has been corrupted + */ + if (unlikely(*allocated == 0)) { + EXT4_ERROR_INODE(inode, + "unexpected allocated == 0, m_len = %u", + map->m_len); + err = -EFSCORRUPTED; + goto errout; + } + map->m_flags |= EXT4_MAP_UNWRITTEN; + goto out; + } + /* IO end_io complete, convert the filled extent to written */ + if (flags & EXT4_GET_BLOCKS_CONVERT) { + path = ext4_convert_unwritten_extents_endio(handle, inode, + map, path); + if (IS_ERR(path)) + return path; + ext4_update_inode_fsync_trans(handle, inode, 1); + goto map_out; + } + /* buffered IO cases */ + /* + * repeat fallocate creation request + * we already have an unwritten extent + */ + if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { + map->m_flags |= EXT4_MAP_UNWRITTEN; + goto map_out; + } + + /* buffered READ or buffered write_begin() lookup */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + /* + * We have blocks reserved already. We + * return allocated blocks so that delalloc + * won't do block reservation for us. But + * the buffer head will be unmapped so that + * a read from the block returns 0s. + */ + map->m_flags |= EXT4_MAP_UNWRITTEN; + goto out1; + } + + /* + * Default case when (flags & EXT4_GET_BLOCKS_CREATE) == 1. + * For buffered writes, at writepage time, etc. Convert a + * discovered unwritten extent to written. 
+ */
+	path = ext4_ext_convert_to_initialized(handle, inode, map, path,
+						flags, allocated);
+	if (IS_ERR(path))
+		return path;
+	ext4_update_inode_fsync_trans(handle, inode, 1);
+	/*
+	 * shouldn't get a 0 allocated when converting an unwritten extent
+	 * unless m_len is 0 (bug) or extent has been corrupted
+	 */
+	if (unlikely(*allocated == 0)) {
+		EXT4_ERROR_INODE(inode, "unexpected allocated == 0, m_len = %u",
+				 map->m_len);
+		err = -EFSCORRUPTED;
+		goto errout;
+	}
+
+out:
+	map->m_flags |= EXT4_MAP_NEW;
+map_out:
+	map->m_flags |= EXT4_MAP_MAPPED;
+out1:
+	map->m_pblk = newblock;
+	if (*allocated > map->m_len)
+		*allocated = map->m_len;
+	map->m_len = *allocated;
+	ext4_ext_show_leaf(inode, path);
+	return path;
+
+errout:
+	ext4_free_ext_path(path);
+	return ERR_PTR(err);
+}
+
+/*
+ * get_implied_cluster_alloc - check to see if the requested
+ * allocation (in the map structure) overlaps with a cluster already
+ * allocated in an extent.
+ *	@sb	The filesystem superblock structure
+ *	@map	The requested lblk->pblk mapping
+ *	@ex	The extent structure which might contain an implied
+ *		cluster allocation
+ *
+ * This function is called by ext4_ext_map_blocks() after we failed to
+ * find blocks that were already in the inode's extent tree. Hence,
+ * we know that the beginning of the requested region cannot overlap
+ * the extent from the inode's extent tree. There are three cases we
+ * want to catch. The first is this case:
+ *
+ *		 |--- cluster # N--|
+ *    |--- extent ---|	|---- requested region ---|
+ *			|==========|
+ *
+ * The second case that we need to test for is this one:
+ *
+ *   |--------- cluster # N ----------------|
+ *	   |--- requested region --|   |------- extent ----|
+ *	   |=======================|
+ *
+ * The third case is when the requested region lies between two extents
+ * within the same cluster:
+ *          |------------- cluster # N-------------|
+ * |----- ex -----|                  |---- ex_right ----|
+ *  |------ requested region ------|
+ *              |================|
+ *
+ * In each of the above cases, we need to set map->m_pblk and map->m_len
+ * so that they correspond to the extent labelled "|====|" from cluster
+ * #N, since it is already in use for data in
+ * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to
+ * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
+ * as a new "allocated" block region. Otherwise, we will return 0 and
+ * ext4_ext_map_blocks() will then allocate one or more new clusters
+ * by calling ext4_mb_new_blocks().
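+ *
+ * As a concrete illustration, with a cluster ratio of 16 blocks per
+ * cluster, an extent ending at logical block 35 and a requested region
+ * beginning at logical block 39 both fall in cluster #2 (blocks 32-47),
+ * so the mapping for block 39 is derived from that cluster's existing
+ * physical allocation rather than allocating a new cluster.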
+ */ +static int get_implied_cluster_alloc(struct super_block *sb, + struct ext4_map_blocks *map, + struct ext4_extent *ex, + struct ext4_ext_path *path) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); + ext4_lblk_t ex_cluster_start, ex_cluster_end; + ext4_lblk_t rr_cluster_start; + ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); + ext4_fsblk_t ee_start = ext4_ext_pblock(ex); + unsigned short ee_len = ext4_ext_get_actual_len(ex); + + /* The extent passed in that we are trying to match */ + ex_cluster_start = EXT4_B2C(sbi, ee_block); + ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1); + + /* The requested region passed into ext4_map_blocks() */ + rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); + + if ((rr_cluster_start == ex_cluster_end) || + (rr_cluster_start == ex_cluster_start)) { + if (rr_cluster_start == ex_cluster_end) + ee_start += ee_len - 1; + map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset; + map->m_len = min(map->m_len, + (unsigned) sbi->s_cluster_ratio - c_offset); + /* + * Check for and handle this case: + * + * |--------- cluster # N-------------| + * |------- extent ----| + * |--- requested region ---| + * |===========| + */ + + if (map->m_lblk < ee_block) + map->m_len = min(map->m_len, ee_block - map->m_lblk); + + /* + * Check for the case where there is already another allocated + * block to the right of 'ex' but before the end of the cluster. + * + * |------------- cluster # N-------------| + * |----- ex -----| |---- ex_right ----| + * |------ requested region ------| + * |================| + */ + if (map->m_lblk > ee_block) { + ext4_lblk_t next = ext4_ext_next_allocated_block(path); + map->m_len = min(map->m_len, next - map->m_lblk); + } + + trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1); + return 1; + } + + trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0); + return 0; +} + +/* + * Determine hole length around the given logical block, first try to + * locate and expand the hole from the given @path, and then adjust it + * if it's partially or completely converted to delayed extents, insert + * it into the extent cache tree if it's indeed a hole, finally return + * the length of the determined extent. + */ +static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t lblk) +{ + ext4_lblk_t hole_start, len; + struct extent_status es; + + hole_start = lblk; + len = ext4_ext_find_hole(inode, path, &hole_start); +again: + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start, + hole_start + len - 1, &es); + if (!es.es_len) + goto insert_hole; + + /* + * There's a delalloc extent in the hole, handle it if the delalloc + * extent is in front of, behind and straddle the queried range. + */ + if (lblk >= es.es_lblk + es.es_len) { + /* + * The delalloc extent is in front of the queried range, + * find again from the queried start block. + */ + len -= lblk - hole_start; + hole_start = lblk; + goto again; + } else if (in_range(lblk, es.es_lblk, es.es_len)) { + /* + * The delalloc extent containing lblk, it must have been + * added after ext4_map_blocks() checked the extent status + * tree so we are not holding i_rwsem and delalloc info is + * only stabilized by i_data_sem we are going to release + * soon. Don't modify the extent status tree and report + * extent as a hole, just adjust the length to the delalloc + * extent's after lblk. 
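+		 * For example (illustrative): with a delalloc extent
+		 * [15, 20) inside the hole and a query at lblk 16, the
+		 * reported length is 20 - 16 = 4 blocks, and nothing is
+		 * inserted into the extent status tree.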
+ */ + len = es.es_lblk + es.es_len - lblk; + return len; + } else { + /* + * The delalloc extent is partially or completely behind + * the queried range, update hole length until the + * beginning of the delalloc extent. + */ + len = min(es.es_lblk - hole_start, len); + } + +insert_hole: + /* Put just found gap into cache to speed up subsequent requests */ + ext_debug(inode, " -> %u:%u\n", hole_start, len); + ext4_es_insert_extent(inode, hole_start, len, ~0, + EXTENT_STATUS_HOLE, false); + + /* Update hole_len to reflect hole size after lblk */ + if (hole_start != lblk) + len -= lblk - hole_start; + + return len; +} + +/* + * Block allocation/map/preallocation routine for extents based files + * + * + * Need to be called with + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block + * (ie, flags is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) + * + * return > 0, number of blocks already mapped/allocated + * if flags doesn't contain EXT4_GET_BLOCKS_CREATE and these are pre-allocated blocks + * buffer head is unmapped + * otherwise blocks are mapped + * + * return = 0, if plain look up failed (blocks have not been allocated) + * buffer head is unmapped + * + * return < 0, error case. + */ +int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent newex, *ex, ex2; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_fsblk_t newblock = 0, pblk; + int err = 0, depth; + unsigned int allocated = 0, offset = 0; + unsigned int allocated_clusters = 0; + struct ext4_allocation_request ar; + ext4_lblk_t cluster_offset; + + ext_debug(inode, "blocks %u/%u requested\n", map->m_lblk, map->m_len); + trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); + + /* find extent for this block */ + path = ext4_find_extent(inode, map->m_lblk, NULL, flags); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + + depth = ext_depth(inode); + + /* + * consistent leaf must not be empty; + * this situation is possible, though, _during_ tree modification; + * this is why assert can't be put in ext4_find_extent() + */ + if (unlikely(path[depth].p_ext == NULL && depth != 0)) { + EXT4_ERROR_INODE(inode, "bad extent address " + "lblock: %lu, depth: %d pblock %lld", + (unsigned long) map->m_lblk, depth, + path[depth].p_block); + err = -EFSCORRUPTED; + goto out; + } + + ex = path[depth].p_ext; + if (ex) { + ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); + ext4_fsblk_t ee_start = ext4_ext_pblock(ex); + unsigned short ee_len; + + + /* + * unwritten extents are treated as holes, except that + * we split out initialized portions during a write. + */ + ee_len = ext4_ext_get_actual_len(ex); + + trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len); + + /* if found extent covers block, simply return it */ + if (in_range(map->m_lblk, ee_block, ee_len)) { + newblock = map->m_lblk - ee_block + ee_start; + /* number of remaining blocks in the extent */ + allocated = ee_len - (map->m_lblk - ee_block); + ext_debug(inode, "%u fit into %u:%d -> %llu\n", + map->m_lblk, ee_block, ee_len, newblock); + + /* + * If the extent is initialized check whether the + * caller wants to convert it to unwritten. 
+ */ + if ((!ext4_ext_is_unwritten(ex)) && + (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { + path = convert_initialized_extent(handle, + inode, map, path, &allocated); + if (IS_ERR(path)) + err = PTR_ERR(path); + goto out; + } else if (!ext4_ext_is_unwritten(ex)) { + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = newblock; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; + ext4_ext_show_leaf(inode, path); + goto out; + } + + path = ext4_ext_handle_unwritten_extents( + handle, inode, map, path, flags, + &allocated, newblock); + if (IS_ERR(path)) + err = PTR_ERR(path); + goto out; + } + } + + /* + * requested block isn't allocated yet; + * we couldn't try to create block if flags doesn't contain EXT4_GET_BLOCKS_CREATE + */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + ext4_lblk_t len; + + len = ext4_ext_determine_insert_hole(inode, path, map->m_lblk); + + map->m_pblk = 0; + map->m_len = min_t(unsigned int, map->m_len, len); + goto out; + } + + /* + * Okay, we need to do block allocation. + */ + newex.ee_block = cpu_to_le32(map->m_lblk); + cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); + + /* + * If we are doing bigalloc, check to see if the extent returned + * by ext4_find_extent() implies a cluster we can use. + */ + if (cluster_offset && ex && + get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { + ar.len = allocated = map->m_len; + newblock = map->m_pblk; + goto got_allocated_blocks; + } + + /* find neighbour allocated blocks */ + ar.lleft = map->m_lblk; + err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); + if (err) + goto out; + ar.lright = map->m_lblk; + err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, + &ex2, flags); + if (err < 0) + goto out; + + /* Check if the extent after searching to the right implies a + * cluster we can use. */ + if ((sbi->s_cluster_ratio > 1) && err && + get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) { + ar.len = allocated = map->m_len; + newblock = map->m_pblk; + err = 0; + goto got_allocated_blocks; + } + + /* + * See if request is beyond maximum number of blocks we can have in + * a single extent. For an initialized extent this limit is + * EXT_INIT_MAX_LEN and for an unwritten extent this limit is + * EXT_UNWRITTEN_MAX_LEN. + */ + if (map->m_len > EXT_INIT_MAX_LEN && + !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) + map->m_len = EXT_INIT_MAX_LEN; + else if (map->m_len > EXT_UNWRITTEN_MAX_LEN && + (flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) + map->m_len = EXT_UNWRITTEN_MAX_LEN; + + /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ + newex.ee_len = cpu_to_le16(map->m_len); + err = ext4_ext_check_overlap(sbi, inode, &newex, path); + if (err) + allocated = ext4_ext_get_actual_len(&newex); + else + allocated = map->m_len; + + /* allocate new block */ + ar.inode = inode; + ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); + ar.logical = map->m_lblk; + /* + * We calculate the offset from the beginning of the cluster + * for the logical block number, since when we allocate a + * physical cluster, the physical block should start at the + * same offset from the beginning of the cluster. This is + * needed so that future calls to get_implied_cluster_alloc() + * work correctly. 
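+	 * For example (illustrative): with s_cluster_ratio = 16, a request
+	 * at logical block 35 has offset 3 within its cluster, so the
+	 * allocation goal and logical start are both moved back by 3
+	 * blocks to keep the physical cluster aligned with the logical one.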
+ */ + offset = EXT4_LBLK_COFF(sbi, map->m_lblk); + ar.len = EXT4_NUM_B2C(sbi, offset+allocated); + ar.goal -= offset; + ar.logical -= offset; + if (S_ISREG(inode->i_mode)) + ar.flags = EXT4_MB_HINT_DATA; + else + /* disable in-core preallocation for non-regular files */ + ar.flags = 0; + if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) + ar.flags |= EXT4_MB_HINT_NOPREALLOC; + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ar.flags |= EXT4_MB_DELALLOC_RESERVED; + if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) + ar.flags |= EXT4_MB_USE_RESERVED; + newblock = ext4_mb_new_blocks(handle, &ar, &err); + if (!newblock) + goto out; + allocated_clusters = ar.len; + ar.len = EXT4_C2B(sbi, ar.len) - offset; + ext_debug(inode, "allocate new block: goal %llu, found %llu/%u, requested %u\n", + ar.goal, newblock, ar.len, allocated); + if (ar.len > allocated) + ar.len = allocated; + +got_allocated_blocks: + /* try to insert new extent into found leaf and return */ + pblk = newblock + offset; + ext4_ext_store_pblock(&newex, pblk); + newex.ee_len = cpu_to_le16(ar.len); + /* Mark unwritten */ + if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { + ext4_ext_mark_unwritten(&newex); + map->m_flags |= EXT4_MAP_UNWRITTEN; + } + + path = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (IS_ERR(path)) { + err = PTR_ERR(path); + if (allocated_clusters) { + int fb_flags = 0; + + /* + * free data blocks we just allocated. + * not a good idea to call discard here directly, + * but otherwise we'd need to call it every free(). + */ + ext4_discard_preallocations(inode); + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE; + ext4_free_blocks(handle, inode, NULL, newblock, + EXT4_C2B(sbi, allocated_clusters), + fb_flags); + } + goto out; + } + + /* + * Cache the extent and update transaction to commit on fdatasync only + * when it is _not_ an unwritten extent. + */ + if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0) + ext4_update_inode_fsync_trans(handle, inode, 1); + else + ext4_update_inode_fsync_trans(handle, inode, 0); + + map->m_flags |= (EXT4_MAP_NEW | EXT4_MAP_MAPPED); + map->m_pblk = pblk; + map->m_len = ar.len; + allocated = map->m_len; + ext4_ext_show_leaf(inode, path); +out: + /* + * We never use EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF with CREATE flag. + * So we know that the depth used here is correct, since there was no + * block allocation done if EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF is set. + * If tomorrow we start using this QUERY flag with CREATE, then we will + * need to re-calculate the depth as it might have changed due to block + * allocation. + */ + if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) { + WARN_ON_ONCE(flags & EXT4_GET_BLOCKS_CREATE); + if (!err && ex && (ex == EXT_LAST_EXTENT(path[depth].p_hdr))) + map->m_flags |= EXT4_MAP_QUERY_LAST_IN_LEAF; + } + + ext4_free_ext_path(path); + + trace_ext4_ext_map_blocks_exit(inode, flags, map, + err ? err : allocated); + return err ? err : allocated; +} + +int ext4_ext_truncate(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + ext4_lblk_t last_block; + int err = 0; + + /* + * TODO: optimization is possible here. + * Probably we need not scan at all, + * because page truncation is enough. 
+ */ + + /* we have to know where to truncate from in crash case */ + EXT4_I(inode)->i_disksize = inode->i_size; + err = ext4_mark_inode_dirty(handle, inode); + if (err) + return err; + + last_block = (inode->i_size + sb->s_blocksize - 1) + >> EXT4_BLOCK_SIZE_BITS(sb); + ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); + +retry_remove_space: + err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); + if (err == -ENOMEM) { + memalloc_retry_wait(GFP_ATOMIC); + goto retry_remove_space; + } + return err; +} + +static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, + ext4_lblk_t len, loff_t new_size, + int flags) +{ + struct inode *inode = file_inode(file); + handle_t *handle; + int ret = 0, ret2 = 0, ret3 = 0; + int retries = 0; + int depth = 0; + struct ext4_map_blocks map; + unsigned int credits; + loff_t epos, old_size = i_size_read(inode); + unsigned int blkbits = inode->i_blkbits; + bool alloc_zero = false; + + BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); + map.m_lblk = offset; + map.m_len = len; + /* + * Don't normalize the request if it can fit in one extent so + * that it doesn't get unnecessarily split into multiple + * extents. + */ + if (len <= EXT_UNWRITTEN_MAX_LEN) + flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; + + /* + * Do the actual write zero during a running journal transaction + * costs a lot. First allocate an unwritten extent and then + * convert it to written after zeroing it out. + */ + if (flags & EXT4_GET_BLOCKS_ZERO) { + flags &= ~EXT4_GET_BLOCKS_ZERO; + flags |= EXT4_GET_BLOCKS_UNWRIT_EXT; + alloc_zero = true; + } + + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, len); + depth = ext_depth(inode); + +retry: + while (len) { + /* + * Recalculate credits when extent tree depth changes. + */ + if (depth != ext_depth(inode)) { + credits = ext4_chunk_trans_blocks(inode, len); + depth = ext_depth(inode); + } + + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret <= 0) { + ext4_debug("inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + break; + } + /* + * allow a full retry cycle for any remaining allocations + */ + retries = 0; + epos = (loff_t)(map.m_lblk + ret) << blkbits; + inode_set_ctime_current(inode); + if (new_size) { + if (epos > new_size) + epos = new_size; + if (ext4_update_inode_size(inode, epos) & 0x1) + inode_set_mtime_to_ts(inode, + inode_get_ctime(inode)); + if (epos > old_size) { + pagecache_isize_extended(inode, old_size, epos); + ext4_zero_partial_blocks(handle, inode, + old_size, epos - old_size); + } + } + ret2 = ext4_mark_inode_dirty(handle, inode); + ext4_update_inode_fsync_trans(handle, inode, 1); + ret3 = ext4_journal_stop(handle); + ret2 = ret3 ? ret3 : ret2; + if (unlikely(ret2)) + break; + + if (alloc_zero && + (map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) { + ret2 = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk, + map.m_len); + if (likely(!ret2)) + ret2 = ext4_convert_unwritten_extents(NULL, + inode, (loff_t)map.m_lblk << blkbits, + (loff_t)map.m_len << blkbits); + if (ret2) + break; + } + + map.m_lblk += ret; + map.m_len = len = len - ret; + } + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + return ret > 0 ? 
ret2 : ret; +} + +static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len); + +static int ext4_insert_range(struct file *file, loff_t offset, loff_t len); + +static long ext4_zero_range(struct file *file, loff_t offset, + loff_t len, int mode) +{ + struct inode *inode = file_inode(file); + handle_t *handle = NULL; + loff_t new_size = 0; + loff_t end = offset + len; + ext4_lblk_t start_lblk, end_lblk; + unsigned int blocksize = i_blocksize(inode); + unsigned int blkbits = inode->i_blkbits; + int ret, flags, credits; + + trace_ext4_zero_range(inode, offset, len, mode); + WARN_ON_ONCE(!inode_is_locked(inode)); + + /* Indirect files do not support unwritten extents */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EOPNOTSUPP; + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) { + new_size = end; + ret = inode_newsize_ok(inode, new_size); + if (ret) + return ret; + } + + flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; + /* Preallocate the range including the unaligned edges */ + if (!IS_ALIGNED(offset | end, blocksize)) { + ext4_lblk_t alloc_lblk = offset >> blkbits; + ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits); + + ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk, + new_size, flags); + if (ret) + return ret; + } + + ret = ext4_update_disksize_before_punch(inode, offset, len); + if (ret) + return ret; + + /* Now release the pages and zero block aligned part of pages */ + ret = ext4_truncate_page_cache_block_range(inode, offset, end); + if (ret) + return ret; + + /* Zero range excluding the unaligned edges */ + start_lblk = EXT4_B_TO_LBLK(inode, offset); + end_lblk = end >> blkbits; + if (end_lblk > start_lblk) { + ext4_lblk_t zero_blks = end_lblk - start_lblk; + + if (mode & FALLOC_FL_WRITE_ZEROES) + flags = EXT4_GET_BLOCKS_CREATE_ZERO | EXT4_EX_NOCACHE; + else + flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | + EXT4_EX_NOCACHE); + ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks, + new_size, flags); + if (ret) + return ret; + } + /* Finish zeroing out if it doesn't contain partial block */ + if (IS_ALIGNED(offset | end, blocksize)) + return ret; + + /* + * In worst case we have to writeout two nonadjacent unwritten + * blocks and update the inode + */ + credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1; + if (ext4_should_journal_data(inode)) + credits += 2; + handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_std_error(inode->i_sb, ret); + return ret; + } + + /* Zero out partial block at the edges of the range */ + ret = ext4_zero_partial_blocks(handle, inode, offset, len); + if (ret) + goto out_handle; + + if (new_size) + ext4_update_inode_size(inode, new_size); + ret = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret)) + goto out_handle; + + ext4_update_inode_fsync_trans(handle, inode, 1); + if (file->f_flags & O_SYNC) + ext4_handle_sync(handle); + +out_handle: + ext4_journal_stop(handle); + return ret; +} + +static long ext4_do_fallocate(struct file *file, loff_t offset, + loff_t len, int mode) +{ + struct inode *inode = file_inode(file); + loff_t end = offset + len; + loff_t new_size = 0; + ext4_lblk_t start_lblk, len_lblk; + int ret; + + trace_ext4_fallocate_enter(inode, offset, len, mode); + WARN_ON_ONCE(!inode_is_locked(inode)); + + start_lblk = offset >> inode->i_blkbits; + len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits); + + /* We only support preallocation for extent-based files 
only. */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out; + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) { + new_size = end; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out; + } + + ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size, + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); + if (ret) + goto out; + + if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { + ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, + EXT4_I(inode)->i_sync_tid); + } +out: + trace_ext4_fallocate_exit(inode, offset, len_lblk, ret); + return ret; +} + +/* + * preallocate space for a file. This implements ext4's fallocate file + * operation, which gets called from sys_fallocate system call. + * For block-mapped files, posix_fallocate should fall back to the method + * of writing zeroes to the required new blocks (the same behavior which is + * expected for file systems which do not support fallocate() system call). + */ +long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; + int ret; + + /* + * Encrypted inodes can't handle collapse range or insert + * range since we would need to re-encrypt blocks with a + * different IV or XTS tweak (which are based on the logical + * block number). + */ + if (IS_ENCRYPTED(inode) && + (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) + return -EOPNOTSUPP; + /* + * Don't allow writing zeroes if the underlying device does not + * enable the unmap write zeroes operation. + */ + if ((mode & FALLOC_FL_WRITE_ZEROES) && + !bdev_write_zeroes_unmap_sectors(inode->i_sb->s_bdev)) + return -EOPNOTSUPP; + + /* Return error if mode is not supported */ + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_ZERO_RANGE | FALLOC_FL_COLLAPSE_RANGE | + FALLOC_FL_INSERT_RANGE | FALLOC_FL_WRITE_ZEROES)) + return -EOPNOTSUPP; + + inode_lock(inode); + ret = ext4_convert_inline_data(inode); + if (ret) + goto out_inode_lock; + + /* Wait all existing dio workers, newcomers will block on i_rwsem */ + inode_dio_wait(inode); + + ret = file_modified(file); + if (ret) + goto out_inode_lock; + + if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ALLOCATE_RANGE) { + ret = ext4_do_fallocate(file, offset, len, mode); + goto out_inode_lock; + } + + /* + * Follow-up operations will drop page cache, hold invalidate lock + * to prevent page faults from reinstantiating pages we have + * released from page cache. + */ + filemap_invalidate_lock(mapping); + + ret = ext4_break_layouts(inode); + if (ret) + goto out_invalidate_lock; + + switch (mode & FALLOC_FL_MODE_MASK) { + case FALLOC_FL_PUNCH_HOLE: + ret = ext4_punch_hole(file, offset, len); + break; + case FALLOC_FL_COLLAPSE_RANGE: + ret = ext4_collapse_range(file, offset, len); + break; + case FALLOC_FL_INSERT_RANGE: + ret = ext4_insert_range(file, offset, len); + break; + case FALLOC_FL_ZERO_RANGE: + case FALLOC_FL_WRITE_ZEROES: + ret = ext4_zero_range(file, offset, len, mode); + break; + default: + ret = -EOPNOTSUPP; + } + +out_invalidate_lock: + filemap_invalidate_unlock(mapping); +out_inode_lock: + inode_unlock(inode); + return ret; +} + +/* + * This function converts a range of blocks to written extents. The caller of + * this function will pass the start offset and the size. all unwritten extents + * within this range will be converted to written extents. 
+ *
+ * This function is called from the direct IO end_io callback function for
+ * atomic writes, to convert the unwritten extents after IO is completed.
+ *
+ * Note that the requirement for atomic writes is that all conversions should
+ * happen atomically in a single fs journal transaction. We only allocate
+ * unwritten extents either over a hole or over a pre-existing unwritten
+ * extent range in ext4_map_blocks_atomic_write(). The only case where we
+ * can have multiple unwritten extents in a range [offset, offset+len) is
+ * when there is a split unwritten extent between two leaf nodes which was
+ * cached in the extent status cache during ext4_iomap_alloc() time. That
+ * will allow ext4_map_blocks_atomic_write() to return the unwritten extent
+ * range without going into the slow path. That means we might need a loop
+ * for conversion of this unwritten extent split across leaf blocks within
+ * a single journal transaction. Extents split across leaf nodes are a rare
+ * case, but let's still handle that to meet the requirements of
+ * multi-fsblock atomic writes.
+ *
+ * Returns 0 on success.
+ */
+int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode,
+					  loff_t offset, ssize_t len)
+{
+	unsigned int max_blocks;
+	int ret = 0, ret2 = 0, ret3 = 0;
+	struct ext4_map_blocks map;
+	unsigned int blkbits = inode->i_blkbits;
+	unsigned int credits = 0;
+	int flags = EXT4_GET_BLOCKS_IO_CONVERT_EXT | EXT4_EX_NOCACHE;
+
+	map.m_lblk = offset >> blkbits;
+	max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
+
+	if (!handle) {
+		/*
+		 * TODO: An optimization can be added later by having an extent
+		 * status flag e.g. EXTENT_STATUS_SPLIT_LEAF. If we query that
+		 * it can tell if the extent in the cache is a split extent.
+		 * But for now let's assume pextents as 2 always.
+		 */
+		credits = ext4_meta_trans_blocks(inode, max_blocks, 2);
+	}
+
+	if (credits) {
+		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			return ret;
+		}
+	}
+
+	while (ret >= 0 && ret < max_blocks) {
+		map.m_lblk += ret;
+		map.m_len = (max_blocks -= ret);
+		ret = ext4_map_blocks(handle, inode, &map, flags);
+		if (ret != max_blocks)
+			ext4_msg(inode->i_sb, KERN_INFO,
+				 "inode #%lu: block %u: len %u: "
+				 "split block mapping found for atomic write, "
+				 "ret = %d",
+				 inode->i_ino, map.m_lblk,
+				 map.m_len, ret);
+		if (ret <= 0)
+			break;
+	}
+
+	ret2 = ext4_mark_inode_dirty(handle, inode);
+
+	if (credits) {
+		ret3 = ext4_journal_stop(handle);
+		if (unlikely(ret3))
+			ret2 = ret3;
+	}
+
+	if (ret <= 0 || ret2)
+		ext4_warning(inode->i_sb,
+			     "inode #%lu: block %u: len %u: "
+			     "returned %d or %d",
+			     inode->i_ino, map.m_lblk,
+			     map.m_len, ret, ret2);
+
+	return ret > 0 ? ret2 : ret;
+}
+
+/*
+ * This function converts a range of blocks to written extents.
+ * The caller of this function will pass the start offset and the size.
+ * All unwritten extents within this range will be converted to
+ * written extents.
+ *
+ * This function is called from the direct IO end_io callback
+ * function, to convert the fallocated extents after IO is completed.
+ * Returns 0 on success.
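+ *
+ * One caller is ext4_convert_unwritten_io_end_vec() below, which invokes
+ * this for each io_end_vec range once writeback of the corresponding
+ * unwritten extents has completed.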
+ */ +int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len) +{ + unsigned int max_blocks; + int ret = 0, ret2 = 0, ret3 = 0; + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + unsigned int credits = 0; + + map.m_lblk = offset >> blkbits; + max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); + + if (!handle) { + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, max_blocks); + } + while (ret >= 0 && ret < max_blocks) { + map.m_lblk += ret; + map.m_len = (max_blocks -= ret); + if (credits) { + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + } + /* + * Do not cache any unrelated extents, as it does not hold the + * i_rwsem or invalidate_lock, which could corrupt the extent + * status tree. + */ + ret = ext4_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_IO_CONVERT_EXT | + EXT4_EX_NOCACHE); + if (ret <= 0) + ext4_warning(inode->i_sb, + "inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); + ret2 = ext4_mark_inode_dirty(handle, inode); + if (credits) { + ret3 = ext4_journal_stop(handle); + if (unlikely(ret3)) + ret2 = ret3; + } + + if (ret <= 0 || ret2) + break; + } + return ret > 0 ? ret2 : ret; +} + +int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end) +{ + int ret = 0, err = 0; + struct ext4_io_end_vec *io_end_vec; + + /* + * This is somewhat ugly but the idea is clear: When transaction is + * reserved, everything goes into it. Otherwise we rather start several + * smaller transactions for conversion of each extent separately. + */ + if (handle) { + handle = ext4_journal_start_reserved(handle, + EXT4_HT_EXT_CONVERT); + if (IS_ERR(handle)) + return PTR_ERR(handle); + } + + list_for_each_entry(io_end_vec, &io_end->list_vec, list) { + ret = ext4_convert_unwritten_extents(handle, io_end->inode, + io_end_vec->offset, + io_end_vec->size); + if (ret) + break; + } + + if (handle) + err = ext4_journal_stop(handle); + + return ret < 0 ? ret : err; +} + +static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap) +{ + __u64 physical = 0; + __u64 length = 0; + int blockbits = inode->i_sb->s_blocksize_bits; + int error = 0; + u16 iomap_type; + + /* in-inode? 
*/ + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + struct ext4_iloc iloc; + int offset; /* offset of xattr in inode */ + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + physical = (__u64)iloc.bh->b_blocknr << blockbits; + offset = EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize; + physical += offset; + length = EXT4_SB(inode->i_sb)->s_inode_size - offset; + brelse(iloc.bh); + iomap_type = IOMAP_INLINE; + } else if (EXT4_I(inode)->i_file_acl) { /* external block */ + physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits; + length = inode->i_sb->s_blocksize; + iomap_type = IOMAP_MAPPED; + } else { + /* no in-inode or external block for xattr, so return -ENOENT */ + error = -ENOENT; + goto out; + } + + iomap->addr = physical; + iomap->offset = 0; + iomap->length = length; + iomap->type = iomap_type; + iomap->flags = 0; +out: + return error; +} + +static int ext4_iomap_xattr_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned flags, + struct iomap *iomap, struct iomap *srcmap) +{ + int error; + + error = ext4_iomap_xattr_fiemap(inode, iomap); + if (error == 0 && (offset >= iomap->length)) + error = -ENOENT; + return error; +} + +static const struct iomap_ops ext4_iomap_xattr_ops = { + .iomap_begin = ext4_iomap_xattr_begin, +}; + +static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) +{ + u64 maxbytes = ext4_get_maxbytes(inode); + + if (*len == 0) + return -EINVAL; + if (start > maxbytes) + return -EFBIG; + + /* + * Shrink request scope to what the fs can actually handle. + */ + if (*len > maxbytes || (maxbytes - *len) < start) + *len = maxbytes - start; + return 0; +} + +int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + int error = 0; + + inode_lock_shared(inode); + if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + error = ext4_ext_precache(inode); + if (error) + goto unlock; + fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; + } + + /* + * For bitmap files the maximum size limit could be smaller than + * s_maxbytes, so check len here manually instead of just relying on the + * generic check. 
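+	 * ("Bitmap files" here means traditional block-mapped files, whose
+	 * maximum file size is smaller than that of extent-mapped files.)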
+ */ + error = ext4_fiemap_check_ranges(inode, start, &len); + if (error) + goto unlock; + + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { + fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; + error = iomap_fiemap(inode, fieinfo, start, len, + &ext4_iomap_xattr_ops); + } else { + error = iomap_fiemap(inode, fieinfo, start, len, + &ext4_iomap_report_ops); + } +unlock: + inode_unlock_shared(inode); + return error; +} + +int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + ext4_lblk_t start_blk, len_blks; + __u64 last_blk; + int error = 0; + + if (ext4_has_inline_data(inode)) { + int has_inline; + + down_read(&EXT4_I(inode)->xattr_sem); + has_inline = ext4_has_inline_data(inode); + up_read(&EXT4_I(inode)->xattr_sem); + if (has_inline) + return 0; + } + + if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + inode_lock_shared(inode); + error = ext4_ext_precache(inode); + inode_unlock_shared(inode); + if (error) + return error; + fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; + } + + error = fiemap_prep(inode, fieinfo, start, &len, 0); + if (error) + return error; + + error = ext4_fiemap_check_ranges(inode, start, &len); + if (error) + return error; + + start_blk = start >> inode->i_sb->s_blocksize_bits; + last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; + if (last_blk >= EXT_MAX_BLOCKS) + last_blk = EXT_MAX_BLOCKS-1; + len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; + + /* + * Walk the extent tree gathering extent information + * and pushing extents back to the user. + */ + return ext4_fill_es_cache_info(inode, start_blk, len_blks, fieinfo); +} + +/* + * ext4_ext_shift_path_extents: + * Shift the extents of a path structure lying between path[depth].p_ext + * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells + * if it is right shift or left shift operation. + */ +static int +ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, + struct inode *inode, handle_t *handle, + enum SHIFT_DIRECTION SHIFT) +{ + int depth, err = 0; + struct ext4_extent *ex_start, *ex_last; + bool update = false; + int credits, restart_credits; + depth = path->p_depth; + + while (depth >= 0) { + if (depth == path->p_depth) { + ex_start = path[depth].p_ext; + if (!ex_start) + return -EFSCORRUPTED; + + ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); + /* leaf + sb + inode */ + credits = 3; + if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) { + update = true; + /* extent tree + sb + inode */ + credits = depth + 2; + } + + restart_credits = ext4_chunk_trans_extent(inode, 0); + err = ext4_datasem_ensure_credits(handle, inode, credits, + restart_credits, 0); + if (err) { + if (err > 0) + err = -EAGAIN; + goto out; + } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + while (ex_start <= ex_last) { + if (SHIFT == SHIFT_LEFT) { + le32_add_cpu(&ex_start->ee_block, + -shift); + /* Try to merge to the left. 
*/ + if ((ex_start > + EXT_FIRST_EXTENT(path[depth].p_hdr)) + && + ext4_ext_try_to_merge_right(inode, + path, ex_start - 1)) + ex_last--; + else + ex_start++; + } else { + le32_add_cpu(&ex_last->ee_block, shift); + ext4_ext_try_to_merge_right(inode, path, + ex_last); + ex_last--; + } + } + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + if (--depth < 0 || !update) + break; + } + + /* Update index too */ + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + if (SHIFT == SHIFT_LEFT) + le32_add_cpu(&path[depth].p_idx->ei_block, -shift); + else + le32_add_cpu(&path[depth].p_idx->ei_block, shift); + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + /* we are done if current index is not a starting index */ + if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) + break; + + depth--; + } + +out: + return err; +} + +/* + * ext4_ext_shift_extents: + * All the extents which lies in the range from @start to the last allocated + * block for the @inode are shifted either towards left or right (depending + * upon @SHIFT) by @shift blocks. + * On success, 0 is returned, error otherwise. + */ +static int +ext4_ext_shift_extents(struct inode *inode, handle_t *handle, + ext4_lblk_t start, ext4_lblk_t shift, + enum SHIFT_DIRECTION SHIFT) +{ + struct ext4_ext_path *path; + int ret = 0, depth; + struct ext4_extent *extent; + ext4_lblk_t stop, *iterator, ex_start, ex_end; + ext4_lblk_t tmp = EXT_MAX_BLOCKS; + + /* Let path point to the last extent */ + path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, + EXT4_EX_NOCACHE); + if (IS_ERR(path)) + return PTR_ERR(path); + + depth = path->p_depth; + extent = path[depth].p_ext; + if (!extent) + goto out; + + stop = le32_to_cpu(extent->ee_block); + + /* + * For left shifts, make sure the hole on the left is big enough to + * accommodate the shift. For right shifts, make sure the last extent + * won't be shifted beyond EXT_MAX_BLOCKS. + */ + if (SHIFT == SHIFT_LEFT) { + path = ext4_find_extent(inode, start - 1, path, + EXT4_EX_NOCACHE); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + if (extent) { + ex_start = le32_to_cpu(extent->ee_block); + ex_end = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + } else { + ex_start = 0; + ex_end = 0; + } + + if ((start == ex_start && shift > ex_start) || + (shift > start - ex_end)) { + ret = -EINVAL; + goto out; + } + } else { + if (shift > EXT_MAX_BLOCKS - + (stop + ext4_ext_get_actual_len(extent))) { + ret = -EINVAL; + goto out; + } + } + + /* + * In case of left shift, iterator points to start and it is increased + * till we reach stop. In case of right shift, iterator points to stop + * and it is decreased till we reach start. + */ +again: + ret = 0; + if (SHIFT == SHIFT_LEFT) + iterator = &start; + else + iterator = &stop; + + if (tmp != EXT_MAX_BLOCKS) + *iterator = tmp; + + /* + * Its safe to start updating extents. Start and stop are unsigned, so + * in case of right shift if extent with 0 block is reached, iterator + * becomes NULL to indicate the end of the loop. 
+ */ + while (iterator && start <= stop) { + path = ext4_find_extent(inode, *iterator, path, + EXT4_EX_NOCACHE); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + if (!extent) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) *iterator); + return -EFSCORRUPTED; + } + if (SHIFT == SHIFT_LEFT && *iterator > + le32_to_cpu(extent->ee_block)) { + /* Hole, move to the next extent */ + if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) { + path[depth].p_ext++; + } else { + *iterator = ext4_ext_next_allocated_block(path); + continue; + } + } + + tmp = *iterator; + if (SHIFT == SHIFT_LEFT) { + extent = EXT_LAST_EXTENT(path[depth].p_hdr); + *iterator = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + } else { + extent = EXT_FIRST_EXTENT(path[depth].p_hdr); + if (le32_to_cpu(extent->ee_block) > start) + *iterator = le32_to_cpu(extent->ee_block) - 1; + else if (le32_to_cpu(extent->ee_block) == start) + iterator = NULL; + else { + extent = EXT_LAST_EXTENT(path[depth].p_hdr); + while (le32_to_cpu(extent->ee_block) >= start) + extent--; + + if (extent == EXT_LAST_EXTENT(path[depth].p_hdr)) + break; + + extent++; + iterator = NULL; + } + path[depth].p_ext = extent; + } + ret = ext4_ext_shift_path_extents(path, shift, inode, + handle, SHIFT); + /* iterator can be NULL which means we should break */ + if (ret == -EAGAIN) + goto again; + if (ret) + break; + } +out: + ext4_free_ext_path(path); + return ret; +} + +/* + * ext4_collapse_range: + * This implements the fallocate's collapse range functionality for ext4 + * Returns: 0 and non-zero on error. + */ +static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; + loff_t end = offset + len; + ext4_lblk_t start_lblk, end_lblk; + handle_t *handle; + unsigned int credits; + loff_t start, new_size; + int ret; + + trace_ext4_collapse_range(inode, offset, len); + WARN_ON_ONCE(!inode_is_locked(inode)); + + /* Currently just for extent based files */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return -EOPNOTSUPP; + /* Collapse range works only on fs cluster size aligned regions. */ + if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) + return -EINVAL; + /* + * There is no need to overlap collapse range with EOF, in which case + * it is effectively a truncate operation + */ + if (end >= inode->i_size) + return -EINVAL; + + /* + * Write tail of the last page before removed range and data that + * will be shifted since they will get removed from the page cache + * below. We are also protected from pages becoming dirty by + * i_rwsem and invalidate_lock. + * Need to round down offset to be aligned with page size boundary + * for page size > block size. 
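+	 * For example (illustrative): with 64K pages and 4K blocks, a
+	 * collapse at offset 24576 rounds the writeback and page cache
+	 * truncation start down to 0, the containing page boundary.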
+ */ + start = round_down(offset, PAGE_SIZE); + ret = filemap_write_and_wait_range(mapping, start, offset); + if (!ret) + ret = filemap_write_and_wait_range(mapping, end, LLONG_MAX); + if (ret) + return ret; + + truncate_pagecache(inode, start); + + credits = ext4_chunk_trans_extent(inode, 0); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); + + start_lblk = offset >> inode->i_blkbits; + end_lblk = (offset + len) >> inode->i_blkbits; + + ext4_check_map_extents_env(inode); + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); + + ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_handle; + } + ext4_discard_preallocations(inode); + + ret = ext4_ext_shift_extents(inode, handle, end_lblk, + end_lblk - start_lblk, SHIFT_LEFT); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_handle; + } + + new_size = inode->i_size - len; + i_size_write(inode, new_size); + EXT4_I(inode)->i_disksize = new_size; + + up_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_mark_inode_dirty(handle, inode); + if (ret) + goto out_handle; + + ext4_update_inode_fsync_trans(handle, inode, 1); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + +out_handle: + ext4_journal_stop(handle); + return ret; +} + +/* + * ext4_insert_range: + * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate. + * The data blocks starting from @offset to the EOF are shifted by @len + * towards right to create a hole in the @inode. Inode size is increased + * by len bytes. + * Returns 0 on success, error otherwise. + */ +static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; + handle_t *handle; + struct ext4_ext_path *path; + struct ext4_extent *extent; + ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0; + unsigned int credits, ee_len; + int ret, depth, split_flag = 0; + loff_t start; + + trace_ext4_insert_range(inode, offset, len); + WARN_ON_ONCE(!inode_is_locked(inode)); + + /* Currently just for extent based files */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return -EOPNOTSUPP; + /* Insert range works only on fs cluster size aligned regions. */ + if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) + return -EINVAL; + /* Offset must be less than i_size */ + if (offset >= inode->i_size) + return -EINVAL; + /* Check whether the maximum file size would be exceeded */ + if (len > inode->i_sb->s_maxbytes - inode->i_size) + return -EFBIG; + + /* + * Write out all dirty pages. Need to round down to align start offset + * to page size boundary for page size > block size. 
+ */ + start = round_down(offset, PAGE_SIZE); + ret = filemap_write_and_wait_range(mapping, start, LLONG_MAX); + if (ret) + return ret; + + truncate_pagecache(inode, start); + + credits = ext4_chunk_trans_extent(inode, 0); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); + + /* Expand file to avoid data loss if there is an error while shifting */ + inode->i_size += len; + EXT4_I(inode)->i_disksize += len; + ret = ext4_mark_inode_dirty(handle, inode); + if (ret) + goto out_handle; + + start_lblk = offset >> inode->i_blkbits; + len_lblk = len >> inode->i_blkbits; + + ext4_check_map_extents_env(inode); + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + path = ext4_find_extent(inode, start_lblk, NULL, 0); + if (IS_ERR(path)) { + up_write(&EXT4_I(inode)->i_data_sem); + ret = PTR_ERR(path); + goto out_handle; + } + + depth = ext_depth(inode); + extent = path[depth].p_ext; + if (extent) { + ee_start_lblk = le32_to_cpu(extent->ee_block); + ee_len = ext4_ext_get_actual_len(extent); + + /* + * If start_lblk is not the starting block of the extent, split + * the extent at @start_lblk + */ + if ((start_lblk > ee_start_lblk) && + (start_lblk < (ee_start_lblk + ee_len))) { + if (ext4_ext_is_unwritten(extent)) + split_flag = EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2; + path = ext4_split_extent_at(handle, inode, path, + start_lblk, split_flag, + EXT4_EX_NOCACHE | + EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_METADATA_NOFAIL); + } + + if (IS_ERR(path)) { + up_write(&EXT4_I(inode)->i_data_sem); + ret = PTR_ERR(path); + goto out_handle; + } + } + + ext4_free_ext_path(path); + ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); + + /* + * If start_lblk lies in a hole which is at the start of the file, use + * ee_start_lblk to shift extents + */ + ret = ext4_ext_shift_extents(inode, handle, + max(ee_start_lblk, start_lblk), len_lblk, SHIFT_RIGHT); + up_write(&EXT4_I(inode)->i_data_sem); + if (ret) + goto out_handle; + + ext4_update_inode_fsync_trans(handle, inode, 1); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + +out_handle: + ext4_journal_stop(handle); + return ret; +} + +/** + * ext4_swap_extents() - Swap extents between two inodes + * @handle: handle for this transaction + * @inode1: First inode + * @inode2: Second inode + * @lblk1: Start block for first inode + * @lblk2: Start block for second inode + * @count: Number of blocks to swap + * @unwritten: Mark second inode's extents as unwritten after swap + * @erp: Pointer to save error value + * + * This helper routine does exactly what is promised: "swap extents". All other + * stuff such as page-cache locking consistency, bh mapping consistency or + * extent's data copying must be performed by the caller.
+ * Locking: + * i_rwsem is held for both inodes + * i_data_sem is locked for write for both inodes + * Assumptions: + * All pages from requested range are locked for both inodes + */ +int +ext4_swap_extents(handle_t *handle, struct inode *inode1, + struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, + ext4_lblk_t count, int unwritten, int *erp) +{ + struct ext4_ext_path *path1 = NULL; + struct ext4_ext_path *path2 = NULL; + int replaced_count = 0; + + BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); + BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); + BUG_ON(!inode_is_locked(inode1)); + BUG_ON(!inode_is_locked(inode2)); + + ext4_es_remove_extent(inode1, lblk1, count); + ext4_es_remove_extent(inode2, lblk2, count); + + while (count) { + struct ext4_extent *ex1, *ex2, tmp_ex; + ext4_lblk_t e1_blk, e2_blk; + int e1_len, e2_len, len; + int split = 0; + + path1 = ext4_find_extent(inode1, lblk1, path1, EXT4_EX_NOCACHE); + if (IS_ERR(path1)) { + *erp = PTR_ERR(path1); + goto errout; + } + path2 = ext4_find_extent(inode2, lblk2, path2, EXT4_EX_NOCACHE); + if (IS_ERR(path2)) { + *erp = PTR_ERR(path2); + goto errout; + } + ex1 = path1[path1->p_depth].p_ext; + ex2 = path2[path2->p_depth].p_ext; + /* Do we have something to swap ? */ + if (unlikely(!ex2 || !ex1)) + goto errout; + + e1_blk = le32_to_cpu(ex1->ee_block); + e2_blk = le32_to_cpu(ex2->ee_block); + e1_len = ext4_ext_get_actual_len(ex1); + e2_len = ext4_ext_get_actual_len(ex2); + + /* Hole handling */ + if (!in_range(lblk1, e1_blk, e1_len) || + !in_range(lblk2, e2_blk, e2_len)) { + ext4_lblk_t next1, next2; + + /* if hole after extent, then go to next extent */ + next1 = ext4_ext_next_allocated_block(path1); + next2 = ext4_ext_next_allocated_block(path2); + /* If hole before extent, then shift to that extent */ + if (e1_blk > lblk1) + next1 = e1_blk; + if (e2_blk > lblk2) + next2 = e2_blk; + /* Do we have something to swap */ + if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) + goto errout; + /* Move to the rightmost boundary */ + len = next1 - lblk1; + if (len < next2 - lblk2) + len = next2 - lblk2; + if (len > count) + len = count; + lblk1 += len; + lblk2 += len; + count -= len; + continue; + } + + /* Prepare left boundary */ + if (e1_blk < lblk1) { + split = 1; + path1 = ext4_force_split_extent_at(handle, inode1, + path1, lblk1, 0); + if (IS_ERR(path1)) { + *erp = PTR_ERR(path1); + goto errout; + } + } + if (e2_blk < lblk2) { + split = 1; + path2 = ext4_force_split_extent_at(handle, inode2, + path2, lblk2, 0); + if (IS_ERR(path2)) { + *erp = PTR_ERR(path2); + goto errout; + } + } + /* ext4_split_extent_at() may result in leaf extent split, + * path must be revalidated. */ + if (split) + continue; + + /* Prepare right boundary */ + len = count; + if (len > e1_blk + e1_len - lblk1) + len = e1_blk + e1_len - lblk1; + if (len > e2_blk + e2_len - lblk2) + len = e2_blk + e2_len - lblk2; + + if (len != e1_len) { + split = 1; + path1 = ext4_force_split_extent_at(handle, inode1, + path1, lblk1 + len, 0); + if (IS_ERR(path1)) { + *erp = PTR_ERR(path1); + goto errout; + } + } + if (len != e2_len) { + split = 1; + path2 = ext4_force_split_extent_at(handle, inode2, + path2, lblk2 + len, 0); + if (IS_ERR(path2)) { + *erp = PTR_ERR(path2); + goto errout; + } + } + /* ext4_split_extent_at() may result in leaf extent split, + * path must be revalidated.
*/ + if (split) + continue; + + BUG_ON(e2_len != e1_len); + *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth); + if (unlikely(*erp)) + goto errout; + *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth); + if (unlikely(*erp)) + goto errout; + + /* Both extents are fully inside boundaries. Swap them now */ + tmp_ex = *ex1; + ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2)); + ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex)); + ex1->ee_len = cpu_to_le16(e2_len); + ex2->ee_len = cpu_to_le16(e1_len); + if (unwritten) + ext4_ext_mark_unwritten(ex2); + if (ext4_ext_is_unwritten(&tmp_ex)) + ext4_ext_mark_unwritten(ex1); + + ext4_ext_try_to_merge(handle, inode2, path2, ex2); + ext4_ext_try_to_merge(handle, inode1, path1, ex1); + *erp = ext4_ext_dirty(handle, inode2, path2 + + path2->p_depth); + if (unlikely(*erp)) + goto errout; + *erp = ext4_ext_dirty(handle, inode1, path1 + + path1->p_depth); + /* + * Looks scary, eh? The second inode already points to the new + * blocks, and it was successfully dirtied. But luckily an error + * may happen only due to a journal error, so the full + * transaction will be aborted anyway. + */ + if (unlikely(*erp)) + goto errout; + + lblk1 += len; + lblk2 += len; + replaced_count += len; + count -= len; + } + +errout: + ext4_free_ext_path(path1); + ext4_free_ext_path(path2); + return replaced_count; +} + +/* + * ext4_clu_mapped - determine whether any block in a logical cluster has + * been mapped to a physical cluster + * + * @inode - file containing the logical cluster + * @lclu - logical cluster of interest + * + * Returns 1 if any block in the logical cluster is mapped, signifying + * that a physical cluster has been allocated for it. Otherwise, + * returns 0. Can also return negative error codes. Derived from + * ext4_ext_map_blocks(). + */ +int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_ext_path *path; + int depth, mapped = 0, err = 0; + struct ext4_extent *extent; + ext4_lblk_t first_lblk, first_lclu, last_lclu; + + /* + * if data can be stored inline, the logical cluster isn't + * mapped - no physical clusters have been allocated, and the + * file has no extents + */ + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) || + ext4_has_inline_data(inode)) + return 0; + + /* search for the extent closest to the first block in the cluster */ + path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + + depth = ext_depth(inode); + + /* + * A consistent leaf must not be empty. This situation is possible, + * though, _during_ tree modification, and it's why an assert can't + * be put in ext4_find_extent(). + */ + if (unlikely(path[depth].p_ext == NULL && depth != 0)) { + EXT4_ERROR_INODE(inode, + "bad extent address - lblock: %lu, depth: %d, pblock: %lld", + (unsigned long) EXT4_C2B(sbi, lclu), + depth, path[depth].p_block); + err = -EFSCORRUPTED; + goto out; + } + + extent = path[depth].p_ext; + + /* can't be mapped if the extent tree is empty */ + if (extent == NULL) + goto out; + + first_lblk = le32_to_cpu(extent->ee_block); + first_lclu = EXT4_B2C(sbi, first_lblk); + + /* + * Three possible outcomes at this point - found extent spanning + * the target cluster, to the left of the target cluster, or to the + * right of the target cluster. The first two cases are handled here. + * The last case indicates the target cluster is not mapped.
+ */ + if (lclu >= first_lclu) { + last_lclu = EXT4_B2C(sbi, first_lblk + + ext4_ext_get_actual_len(extent) - 1); + if (lclu <= last_lclu) { + mapped = 1; + } else { + first_lblk = ext4_ext_next_allocated_block(path); + first_lclu = EXT4_B2C(sbi, first_lblk); + if (lclu == first_lclu) + mapped = 1; + } + } + +out: + ext4_free_ext_path(path); + + return err ? err : mapped; +} + +/* + * Updates physical block address and unwritten status of extent + * starting at lblk start and of len. If such an extent doesn't exist, + * this function splits the extent tree appropriately to create an + * extent like this. This function is called in the fast commit + * replay path. Returns 0 on success and error on failure. + */ +int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, + int len, int unwritten, ext4_fsblk_t pblk) +{ + struct ext4_ext_path *path; + struct ext4_extent *ex; + int ret; + + path = ext4_find_extent(inode, start, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + ex = path[path->p_depth].p_ext; + if (!ex) { + ret = -EFSCORRUPTED; + goto out; + } + + if (le32_to_cpu(ex->ee_block) != start || + ext4_ext_get_actual_len(ex) != len) { + /* We need to split this extent to match our extent first */ + down_write(&EXT4_I(inode)->i_data_sem); + path = ext4_force_split_extent_at(NULL, inode, path, start, 1); + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_ERR(path)) { + ret = PTR_ERR(path); + goto out; + } + + path = ext4_find_extent(inode, start, path, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + + ex = path[path->p_depth].p_ext; + WARN_ON(le32_to_cpu(ex->ee_block) != start); + + if (ext4_ext_get_actual_len(ex) != len) { + down_write(&EXT4_I(inode)->i_data_sem); + path = ext4_force_split_extent_at(NULL, inode, path, + start + len, 1); + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_ERR(path)) { + ret = PTR_ERR(path); + goto out; + } + + path = ext4_find_extent(inode, start, path, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + ex = path[path->p_depth].p_ext; + } + } + if (unwritten) + ext4_ext_mark_unwritten(ex); + else + ext4_ext_mark_initialized(ex); + ext4_ext_store_pblock(ex, pblk); + down_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]); + up_write(&EXT4_I(inode)->i_data_sem); +out: + ext4_free_ext_path(path); + ext4_mark_inode_dirty(NULL, inode); + return ret; +} + +/* Try to shrink the extent tree */ +void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ex; + ext4_lblk_t old_cur, cur = 0; + + while (cur < end) { + path = ext4_find_extent(inode, cur, NULL, 0); + if (IS_ERR(path)) + return; + ex = path[path->p_depth].p_ext; + if (!ex) { + ext4_free_ext_path(path); + ext4_mark_inode_dirty(NULL, inode); + return; + } + old_cur = cur; + cur = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); + if (cur <= old_cur) + cur = old_cur + 1; + ext4_ext_try_to_merge(NULL, inode, path, ex); + down_write(&EXT4_I(inode)->i_data_sem); + ext4_ext_dirty(NULL, inode, &path[path->p_depth]); + up_write(&EXT4_I(inode)->i_data_sem); + ext4_mark_inode_dirty(NULL, inode); + ext4_free_ext_path(path); + } +} + +/* Check if *cur is a hole and if it is, skip it */ +static int skip_hole(struct inode *inode, ext4_lblk_t *cur) +{ + int ret; + struct ext4_map_blocks map; + + map.m_lblk = *cur; + map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur; + + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; + if (ret != 0) + 
return 0; + *cur = *cur + map.m_len; + return 0; +} + +/* Count number of blocks used by this inode and update i_blocks */ +int ext4_ext_replay_set_iblocks(struct inode *inode) +{ + struct ext4_ext_path *path = NULL, *path2 = NULL; + struct ext4_extent *ex; + ext4_lblk_t cur = 0, end; + int numblks = 0, i, ret = 0; + ext4_fsblk_t cmp1, cmp2; + struct ext4_map_blocks map; + + /* Determine the size of the file first */ + path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, + EXT4_EX_NOCACHE); + if (IS_ERR(path)) + return PTR_ERR(path); + ex = path[path->p_depth].p_ext; + if (!ex) + goto out; + end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); + + /* Count the number of data blocks */ + cur = 0; + while (cur < end) { + map.m_lblk = cur; + map.m_len = end - cur; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + break; + if (ret > 0) + numblks += ret; + cur = cur + map.m_len; + } + + /* + * Count the number of extent tree blocks. We do it by looking up + * two successive extents and determining the difference between + * their paths. When the path differs for two successive extents + * we compare the blocks in the path at each level and increment + * iblocks by the total number of differences found. + */ + cur = 0; + ret = skip_hole(inode, &cur); + if (ret < 0) + goto out; + path = ext4_find_extent(inode, cur, path, 0); + if (IS_ERR(path)) + goto out; + numblks += path->p_depth; + while (cur < end) { + path = ext4_find_extent(inode, cur, path, 0); + if (IS_ERR(path)) + break; + ex = path[path->p_depth].p_ext; + if (!ex) + goto cleanup; + + cur = max(cur + 1, le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)); + ret = skip_hole(inode, &cur); + if (ret < 0) + break; + + path2 = ext4_find_extent(inode, cur, path2, 0); + if (IS_ERR(path2)) + break; + + for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) { + cmp1 = cmp2 = 0; + if (i <= path->p_depth) + cmp1 = path[i].p_bh ? + path[i].p_bh->b_blocknr : 0; + if (i <= path2->p_depth) + cmp2 = path2[i].p_bh ?
+ path2[i].p_bh->b_blocknr : 0; + if (cmp1 != cmp2 && cmp2 != 0) + numblks++; + } + } + +out: + inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9); + ext4_mark_inode_dirty(NULL, inode); +cleanup: + ext4_free_ext_path(path); + ext4_free_ext_path(path2); + return 0; +} + +int ext4_ext_clear_bb(struct inode *inode) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ex; + ext4_lblk_t cur = 0, end; + int j, ret = 0; + struct ext4_map_blocks map; + + if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) + return 0; + + /* Determine the size of the file first */ + path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, + EXT4_EX_NOCACHE); + if (IS_ERR(path)) + return PTR_ERR(path); + ex = path[path->p_depth].p_ext; + if (!ex) + goto out; + end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); + + cur = 0; + while (cur < end) { + map.m_lblk = cur; + map.m_len = end - cur; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + break; + if (ret > 0) { + path = ext4_find_extent(inode, map.m_lblk, path, 0); + if (!IS_ERR(path)) { + for (j = 0; j < path->p_depth; j++) { + ext4_mb_mark_bb(inode->i_sb, + path[j].p_block, 1, false); + ext4_fc_record_regions(inode->i_sb, inode->i_ino, + 0, path[j].p_block, 1, 1); + } + } else { + path = NULL; + } + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); + ext4_fc_record_regions(inode->i_sb, inode->i_ino, + map.m_lblk, map.m_pblk, map.m_len, 1); + } + cur = cur + map.m_len; + } + +out: + ext4_free_ext_path(path); + return 0; +} -- 2.43.0
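As context for the collapse/insert range code above: ext4_collapse_range() and ext4_insert_range() are reached from userspace through fallocate(2) with the FALLOC_FL_COLLAPSE_RANGE and FALLOC_FL_INSERT_RANGE modes. A minimal sketch of how they are exercised follows; it is illustrative only and not part of the imported source, and it assumes a hypothetical 4096-byte cluster size and a pre-existing multi-cluster file named "testfile".

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const off_t clu = 4096;			/* assumed fs cluster size */
	int fd = open("testfile", O_RDWR);	/* hypothetical test file */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Remove one cluster at offset clu; all later data shifts left */
	if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, clu, clu) < 0)
		perror("FALLOC_FL_COLLAPSE_RANGE");
	/* Open a one-cluster hole at clu; all later data shifts right */
	if (fallocate(fd, FALLOC_FL_INSERT_RANGE, clu, clu) < 0)
		perror("FALLOC_FL_INSERT_RANGE");
	close(fd);
	return 0;
}

Both modes require offset and len to be aligned to the filesystem cluster size; collapse additionally requires the range to end before EOF, and insert requires the offset to be below i_size, matching the -EINVAL checks in ext4_collapse_range() and ext4_insert_range() above.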
From: Simon Glass <simon.glass@canonical.com> Copy inode.c from Linux v6.18 fs/ext4 directory. This file implements inode operations including reading, writing, and truncating files. Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> --- fs/ext4l/inode.c | 6756 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 6756 insertions(+) create mode 100644 fs/ext4l/inode.c diff --git a/fs/ext4l/inode.c b/fs/ext4l/inode.c new file mode 100644 index 00000000000..e99306a8f47 --- /dev/null +++ b/fs/ext4l/inode.c @@ -0,0 +1,6756 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + * + * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 + */ + +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/time.h> +#include <linux/highuid.h> +#include <linux/pagemap.h> +#include <linux/dax.h> +#include <linux/quotaops.h> +#include <linux/string.h> +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include <linux/mpage.h> +#include <linux/rmap.h> +#include <linux/namei.h> +#include <linux/uio.h> +#include <linux/bio.h> +#include <linux/workqueue.h> +#include <linux/kernel.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/bitops.h> +#include <linux/iomap.h> +#include <linux/iversion.h> + +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" +#include "truncate.h" + +#include <trace/events/ext4.h> + +static void ext4_journalled_zero_new_buffers(handle_t *handle, + struct inode *inode, + struct folio *folio, + unsigned from, unsigned to); + +static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei) +{ + __u32 csum; + __u16 dummy_csum = 0; + int offset = offsetof(struct ext4_inode, i_checksum_lo); + unsigned int csum_size = sizeof(dummy_csum); + + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)raw, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); + offset += csum_size; + csum = ext4_chksum(csum, (__u8 *)raw + offset, + EXT4_GOOD_OLD_INODE_SIZE - offset); + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + offset = offsetof(struct ext4_inode, i_checksum_hi); + csum = ext4_chksum(csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE, + offset - EXT4_GOOD_OLD_INODE_SIZE); + if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, + csum_size); + offset += csum_size; + } + csum = ext4_chksum(csum, (__u8 *)raw + offset, + EXT4_INODE_SIZE(inode->i_sb) - offset); + } + + return csum; +} + +static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei) +{ + __u32 provided, calculated; + + if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != + cpu_to_le32(EXT4_OS_LINUX) || + !ext4_has_feature_metadata_csum(inode->i_sb)) + return 1; + + provided = le16_to_cpu(raw->i_checksum_lo); + calculated = ext4_inode_csum(inode, raw, ei); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) + provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16; + else + calculated &= 0xFFFF; + + return provided == calculated; +} + +void 
ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei) +{ + __u32 csum; + + if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != + cpu_to_le32(EXT4_OS_LINUX) || + !ext4_has_feature_metadata_csum(inode->i_sb)) + return; + + csum = ext4_inode_csum(inode, raw, ei); + raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) + raw->i_checksum_hi = cpu_to_le16(csum >> 16); +} + +static inline int ext4_begin_ordered_truncate(struct inode *inode, + loff_t new_size) +{ + trace_ext4_begin_ordered_truncate(inode, new_size); + /* + * If jinode is zero, then we never opened the file for + * writing, so there's no need to call + * jbd2_journal_begin_ordered_truncate() since there are no + * outstanding writes we need to flush. + */ + if (!EXT4_I(inode)->jinode) + return 0; + return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode, + new_size); +} + +/* + * Test whether an inode is a fast symlink. + * A fast symlink has its symlink data stored in ext4_inode_info->i_data. + */ +int ext4_inode_is_fast_symlink(struct inode *inode) +{ + if (!ext4_has_feature_ea_inode(inode->i_sb)) { + int ea_blocks = EXT4_I(inode)->i_file_acl ? + EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; + + if (ext4_has_inline_data(inode)) + return 0; + + return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); + } + return S_ISLNK(inode->i_mode) && inode->i_size && + (inode->i_size < EXT4_N_BLOCKS * 4); +} + +/* + * Called at the last iput() if i_nlink is zero. + */ +void ext4_evict_inode(struct inode *inode) +{ + handle_t *handle; + int err; + /* + * Credits for final inode cleanup and freeing: + * sb + inode (ext4_orphan_del()), block bitmap, group descriptor + * (xattr block freeing), bitmap, group descriptor (inode freeing) + */ + int extra_credits = 6; + struct ext4_xattr_inode_array *ea_inode_array = NULL; + bool freeze_protected = false; + + trace_ext4_evict_inode(inode); + + dax_break_layout_final(inode); + + if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) + ext4_evict_ea_inode(inode); + if (inode->i_nlink) { + truncate_inode_pages_final(&inode->i_data); + + goto no_delete; + } + + if (is_bad_inode(inode)) + goto no_delete; + dquot_initialize(inode); + + if (ext4_should_order_data(inode)) + ext4_begin_ordered_truncate(inode, 0); + truncate_inode_pages_final(&inode->i_data); + + /* + * For inodes with journalled data, transaction commit could have + * dirtied the inode. And for inodes with dioread_nolock, the + * unwritten extent conversion worker could merge extents and also + * have dirtied the inode. The flush worker ignores it because of + * the I_FREEING flag, but we still need to remove the inode from + * the writeback lists. + */ + if (!list_empty_careful(&inode->i_io_list)) + inode_io_list_del(inode); + + /* + * Protect us against freezing - iput() caller didn't have to have any + * protection against it. When we are in a running transaction though, + * we are already protected against freezing and we cannot grab further + * protection due to lock ordering constraints. + */ + if (!ext4_journal_current_handle()) { + sb_start_intwrite(inode->i_sb); + freeze_protected = true; + } + + if (!IS_NOQUOTA(inode)) + extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); + + /* + * Block bitmap, group descriptor, and inode are accounted in both + * ext4_blocks_for_truncate() and extra_credits. So subtract 3.
+ */ + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, + ext4_blocks_for_truncate(inode) + extra_credits - 3); + if (IS_ERR(handle)) { + ext4_std_error(inode->i_sb, PTR_ERR(handle)); + /* + * If we're going to skip the normal cleanup, we still need to + * make sure that the in-core orphan linked list is properly + * cleaned up. + */ + ext4_orphan_del(NULL, inode); + if (freeze_protected) + sb_end_intwrite(inode->i_sb); + goto no_delete; + } + + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + + /* + * Set inode->i_size to 0 before calling ext4_truncate(). We need + * special handling of symlinks here because i_size is used to + * determine whether ext4_inode_info->i_data contains symlink data or + * block mappings. Setting i_size to 0 will remove its fast symlink + * status. Erase i_data so that it becomes a valid empty block map. + */ + if (ext4_inode_is_fast_symlink(inode)) + memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data)); + inode->i_size = 0; + err = ext4_mark_inode_dirty(handle, inode); + if (err) { + ext4_warning(inode->i_sb, + "couldn't mark inode dirty (err %d)", err); + goto stop_handle; + } + if (inode->i_blocks) { + err = ext4_truncate(inode); + if (err) { + ext4_error_err(inode->i_sb, -err, + "couldn't truncate inode %lu (err %d)", + inode->i_ino, err); + goto stop_handle; + } + } + + /* Remove xattr references. */ + err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array, + extra_credits); + if (err) { + ext4_warning(inode->i_sb, "xattr delete (err %d)", err); +stop_handle: + ext4_journal_stop(handle); + ext4_orphan_del(NULL, inode); + if (freeze_protected) + sb_end_intwrite(inode->i_sb); + ext4_xattr_inode_array_free(ea_inode_array); + goto no_delete; + } + + /* + * Kill off the orphan record which ext4_truncate created. + * AKPM: I think this can be inside the above `if'. + * Note that ext4_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - this is because we don't + * know if ext4_truncate() actually created an orphan record. + * (Well, we could do this if we need to, but heck - it works) + */ + ext4_orphan_del(handle, inode); + EXT4_I(inode)->i_dtime = (__u32)ktime_get_real_seconds(); + + /* + * One subtle ordering requirement: if anything has gone wrong + * (transaction abort, IO errors, whatever), then we can still + * do these next steps (the fs will already have been marked as + * having errors), but we can't free the inode if the mark_dirty + * fails. + */ + if (ext4_mark_inode_dirty(handle, inode)) + /* If that failed, just do the required in-core inode clear. */ + ext4_clear_inode(inode); + else + ext4_free_inode(handle, inode); + ext4_journal_stop(handle); + if (freeze_protected) + sb_end_intwrite(inode->i_sb); + ext4_xattr_inode_array_free(ea_inode_array); + return; +no_delete: + /* + * Check whether somewhere else accidentally dirtied the evicting + * inode, which could cause inode use-after-free issues later. + */ + WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list)); + + if (!list_empty(&EXT4_I(inode)->i_fc_list)) + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); + ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ +} + +#ifdef CONFIG_QUOTA +qsize_t *ext4_get_reserved_space(struct inode *inode) +{ + return &EXT4_I(inode)->i_reserved_quota; +} +#endif + +/* + * Called with i_data_sem down, which is important since we can call + * ext4_discard_preallocations() from here.
+ */ +void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + + spin_lock(&ei->i_block_reservation_lock); + trace_ext4_da_update_reserve_space(inode, used, quota_claim); + if (unlikely(used > ei->i_reserved_data_blocks)) { + ext4_warning(inode->i_sb, "%s: ino %lu, used %d " + "with only %d reserved data blocks", + __func__, inode->i_ino, used, + ei->i_reserved_data_blocks); + WARN_ON(1); + used = ei->i_reserved_data_blocks; + } + + /* Update per-inode reservations */ + ei->i_reserved_data_blocks -= used; + percpu_counter_sub(&sbi->s_dirtyclusters_counter, used); + + spin_unlock(&ei->i_block_reservation_lock); + + /* Update quota subsystem for data blocks */ + if (quota_claim) + dquot_claim_block(inode, EXT4_C2B(sbi, used)); + else { + /* + * We did fallocate with an offset that is already delayed + * allocated. So on delayed allocated writeback we should + * not re-claim the quota for fallocated blocks. + */ + dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); + } + + /* + * If we have done all the pending block allocations and if + * there aren't any writers on the inode, we can discard the + * inode's preallocations. + */ + if ((ei->i_reserved_data_blocks == 0) && + !inode_is_open_for_write(inode)) + ext4_discard_preallocations(inode); +} + +static int __check_block_validity(struct inode *inode, const char *func, + unsigned int line, + struct ext4_map_blocks *map) +{ + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + + if (journal && inode == journal->j_inode) + return 0; + + if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { + ext4_error_inode(inode, func, line, map->m_pblk, + "lblock %lu mapped to illegal pblock %llu " + "(length %d)", (unsigned long) map->m_lblk, + map->m_pblk, map->m_len); + return -EFSCORRUPTED; + } + return 0; +} + +int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, + ext4_lblk_t len) +{ + int ret; + + if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) + return fscrypt_zeroout_range(inode, lblk, pblk, len); + + ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); + if (ret > 0) + ret = 0; + + return ret; +} + +/* + * For generic regular files, when updating the extent tree, Ext4 should + * hold the i_rwsem and invalidate_lock exclusively. This ensures + * exclusion against concurrent page faults, as well as reads and writes. + */ +#ifdef CONFIG_EXT4_DEBUG +void ext4_check_map_extents_env(struct inode *inode) +{ + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return; + + if (!S_ISREG(inode->i_mode) || + IS_NOQUOTA(inode) || IS_VERITY(inode) || + is_special_ino(inode->i_sb, inode->i_ino) || + (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || + ext4_verity_in_progress(inode)) + return; + + WARN_ON_ONCE(!inode_is_locked(inode) && + !rwsem_is_locked(&inode->i_mapping->invalidate_lock)); +} +#else +void ext4_check_map_extents_env(struct inode *inode) {} +#endif + +#define check_block_validity(inode, map) \ + __check_block_validity((inode), __func__, __LINE__, (map)) + +#ifdef ES_AGGRESSIVE_TEST +static void ext4_map_blocks_es_recheck(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *es_map, + struct ext4_map_blocks *map, + int flags) +{ + int retval; + + map->m_flags = 0; + /* + * There is a race window in which the result is not the same, + * e.g. xfstests #223 when dioread_nolock is enabled.
The reason + is that we look up a block mapping in the extent status tree + without taking i_data_sem. So at the time the unwritten extent + could be converted. + */ + down_read(&EXT4_I(inode)->i_data_sem); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, 0); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, 0); + } + up_read((&EXT4_I(inode)->i_data_sem)); + + /* + * We don't check m_len because the extent will be collapsed in the + * status tree, so the m_len might not be equal. + */ + if (es_map->m_lblk != map->m_lblk || + es_map->m_flags != map->m_flags || + es_map->m_pblk != map->m_pblk) { + printk("ES cache assertion failed for inode: %lu " + "es_cached ex [%d/%d/%llu/%x] != " + "found ex [%d/%d/%llu/%x] retval %d flags %x\n", + inode->i_ino, es_map->m_lblk, es_map->m_len, + es_map->m_pblk, es_map->m_flags, map->m_lblk, + map->m_len, map->m_pblk, map->m_flags, + retval, flags); + } +} +#endif /* ES_AGGRESSIVE_TEST */ + +static int ext4_map_query_blocks_next_in_leaf(handle_t *handle, + struct inode *inode, struct ext4_map_blocks *map, + unsigned int orig_mlen) +{ + struct ext4_map_blocks map2; + unsigned int status, status2; + int retval; + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + + WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF)); + WARN_ON_ONCE(orig_mlen <= map->m_len); + + /* Prepare map2 for lookup in next leaf block */ + map2.m_lblk = map->m_lblk + map->m_len; + map2.m_len = orig_mlen - map->m_len; + map2.m_flags = 0; + retval = ext4_ext_map_blocks(handle, inode, &map2, 0); + + if (retval <= 0) { + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + return map->m_len; + } + + if (unlikely(retval != map2.m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map2.m_len); + WARN_ON(1); + } + + status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + + /* + * If map2 is contiguous with map, then let's insert it as a single + * extent in the es cache and return the combined length of both maps. + */ + if (map->m_pblk + map->m_len == map2.m_pblk && + status == status2) { + ext4_es_insert_extent(inode, map->m_lblk, + map->m_len + map2.m_len, map->m_pblk, + status, false); + map->m_len += map2.m_len; + } else { + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + } + + return map->m_len; +} + +static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) +{ + unsigned int status; + int retval; + unsigned int orig_mlen = map->m_len; + + flags &= EXT4_EX_QUERY_FILTER; + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + retval = ext4_ext_map_blocks(handle, inode, map, flags); + else + retval = ext4_ind_map_blocks(handle, inode, map, flags); + + if (retval <= 0) + return retval; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + /* + * No need to query next in leaf: + * - if returned extent is not last in leaf or + * - if the last in leaf is the full requested range + */ + if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) || + map->m_len == orig_mlen) { + status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + return retval; + } + + return ext4_map_query_blocks_next_in_leaf(handle, inode, map, + orig_mlen); +} + +static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) +{ + struct extent_status es; + unsigned int status; + int err, retval = 0; + + /* + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE, which + * indicates that the blocks and quotas have already been + * checked when the data was copied into the page cache. + */ + if (map->m_flags & EXT4_MAP_DELAYED) + flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + + /* + * Here we clear m_flags because after allocating a new extent, + * it will be set again. + */ + map->m_flags &= ~EXT4_MAP_FLAGS; + + /* + * We need to check for EXT4 here because migrate could have + * changed the inode type in between. + */ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags); + + /* + * We allocated new blocks which will result in i_data's + * format changing. Force the migrate to fail by clearing + * migrate flags. + */ + if (retval > 0 && map->m_flags & EXT4_MAP_NEW) + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + } + if (retval <= 0) + return retval; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode %lu: " + "retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + /* + * We have to zeroout blocks before inserting them into extent + * status tree. Otherwise someone could look them up there and + * use them before they are really zeroed. We also have to + * unmap metadata before zeroing as otherwise writeback can + * overwrite zeros with stale data from block device. + */ + if (flags & EXT4_GET_BLOCKS_ZERO && + map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { + err = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, + map->m_len); + if (err) + return err; + } + + /* + * If the extent has been zeroed out, we don't need to update + * extent status tree. + */ + if (flags & EXT4_GET_BLOCKS_PRE_IO && + ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (ext4_es_is_written(&es)) + return retval; + } + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, + status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE); + + return retval; +} + +/* + * The ext4_map_blocks() function tries to look up the requested blocks, + * and returns if the blocks are already mapped. + * + * Otherwise it takes the write lock of the i_data_sem, allocates blocks, + * stores the allocated blocks in the result buffer head and marks it + * mapped. + * + * If the file is extents based, it will call ext4_ext_map_blocks(); + * otherwise, it calls ext4_ind_map_blocks() to handle indirect-mapping + * based files. + * + * On success, it returns the number of blocks being mapped or allocated. + * If flags doesn't contain EXT4_GET_BLOCKS_CREATE and the blocks are + * pre-allocated and unwritten, the resulting @map is marked as unwritten. + * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped.
+ * + * It returns 0 if plain look up failed (blocks have not been allocated), in + * that case, @map is returned as unmapped but we still do fill map->m_len to + * indicate the length of a hole starting at map->m_lblk. + * + * It returns the error in case of allocation failure. + */ +int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) +{ + struct extent_status es; + int retval; + int ret = 0; + unsigned int orig_mlen = map->m_len; +#ifdef ES_AGGRESSIVE_TEST + struct ext4_map_blocks orig_map; + + memcpy(&orig_map, map, sizeof(*map)); +#endif + + map->m_flags = 0; + ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n", + flags, map->m_len, (unsigned long) map->m_lblk); + + /* + * ext4_map_blocks returns an int, and m_len is an unsigned int + */ + if (unlikely(map->m_len > INT_MAX)) + map->m_len = INT_MAX; + + /* We can handle the block number less than EXT_MAX_BLOCKS */ + if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) + return -EFSCORRUPTED; + + /* + * Callers from the context of data submission are the only exceptions + * for regular files that do not hold the i_rwsem or invalidate_lock. + * However, caching unrelated ranges is not permitted. + */ + if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) + WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE)); + else + ext4_check_map_extents_env(inode); + + /* Lookup extent status tree firstly */ + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { + map->m_pblk = ext4_es_pblock(&es) + + map->m_lblk - es.es_lblk; + map->m_flags |= ext4_es_is_written(&es) ? + EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; + retval = es.es_len - (map->m_lblk - es.es_lblk); + if (retval > map->m_len) + retval = map->m_len; + map->m_len = retval; + } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { + map->m_pblk = 0; + map->m_flags |= ext4_es_is_delayed(&es) ? + EXT4_MAP_DELAYED : 0; + retval = es.es_len - (map->m_lblk - es.es_lblk); + if (retval > map->m_len) + retval = map->m_len; + map->m_len = retval; + retval = 0; + } else { + BUG(); + } + + if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) + return retval; +#ifdef ES_AGGRESSIVE_TEST + ext4_map_blocks_es_recheck(handle, inode, map, + &orig_map, flags); +#endif + if (!(flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) || + orig_mlen == map->m_len) + goto found; + + map->m_len = orig_mlen; + } + /* + * In the query cache no-wait mode, nothing we can do more if we + * cannot find extent in the cache. + */ + if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) + return 0; + + /* + * Try to see if we can get the block without requesting a new + * file system block. 
+ */ + down_read(&EXT4_I(inode)->i_data_sem); + retval = ext4_map_query_blocks(handle, inode, map, flags); + up_read((&EXT4_I(inode)->i_data_sem)); + +found: + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + ret = check_block_validity(inode, map); + if (ret != 0) + return ret; + } + + /* If it is only a block(s) lookup */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) + return retval; + + /* + * Return if the blocks have already been allocated. + * + * Note that if blocks have been preallocated, + * ext4_ext_map_blocks() returns with the buffer head unmapped. + */ + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) + /* + * If we need to convert the extent to unwritten + * we continue and do the actual work in + * ext4_ext_map_blocks() + */ + if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) + return retval; + + ext4_fc_track_inode(handle, inode); + /* + * New block allocation and/or writing to an unwritten extent + * will possibly result in updating i_data, so we take + * the write lock of i_data_sem, and call get_block() + * with create == 1 flag. + */ + down_write(&EXT4_I(inode)->i_data_sem); + retval = ext4_map_create_blocks(handle, inode, map, flags); + up_write((&EXT4_I(inode)->i_data_sem)); + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + ret = check_block_validity(inode, map); + if (ret != 0) + return ret; + + /* + * Inodes with freshly allocated blocks where contents will be + * visible after transaction commit must be on transaction's + * ordered data list. + */ + if (map->m_flags & EXT4_MAP_NEW && + !(map->m_flags & EXT4_MAP_UNWRITTEN) && + !(flags & EXT4_GET_BLOCKS_ZERO) && + !ext4_is_quota_file(inode) && + ext4_should_order_data(inode)) { + loff_t start_byte = + (loff_t)map->m_lblk << inode->i_blkbits; + loff_t length = (loff_t)map->m_len << inode->i_blkbits; + + if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) + ret = ext4_jbd2_inode_add_wait(handle, inode, + start_byte, length); + else + ret = ext4_jbd2_inode_add_write(handle, inode, + start_byte, length); + if (ret) + return ret; + } + } + if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN || + map->m_flags & EXT4_MAP_MAPPED)) + ext4_fc_track_range(handle, inode, map->m_lblk, + map->m_lblk + map->m_len - 1); + if (retval < 0) + ext_debug(inode, "failed with err %d\n", retval); + return retval; +} + +/* + * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages + * we have to be careful as someone else may be manipulating b_state as well. + */ +static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) +{ + unsigned long old_state; + unsigned long new_state; + + flags &= EXT4_MAP_FLAGS; + + /* Dummy buffer_head? Set non-atomically. */ + if (!bh->b_folio) { + bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags; + return; + } + /* + * Someone else may be modifying b_state. Be careful! This is ugly but + * once we get rid of using bh as a container for mapping information + * to pass to / from get_block functions, this can go away. + */ + old_state = READ_ONCE(bh->b_state); + do { + new_state = (old_state & ~EXT4_MAP_FLAGS) | flags; + } while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state))); +} + +/* + * Make sure that the current journal transaction has enough credits to map + * one extent. Return -EAGAIN if it cannot extend the current running + * transaction. + */ +static inline int ext4_journal_ensure_extent_credits(handle_t *handle, + struct inode *inode) +{ + int credits; + int ret; + + /* Called from ext4_da_write_begin() which has no handle started?
*/ + if (!handle) + return 0; + + credits = ext4_chunk_trans_blocks(inode, 1); + ret = __ext4_journal_ensure_credits(handle, credits, credits, 0); + return ret <= 0 ? ret : -EAGAIN; +} + +static int _ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int flags) +{ + struct ext4_map_blocks map; + int ret = 0; + + if (ext4_has_inline_data(inode)) + return -ERANGE; + + map.m_lblk = iblock; + map.m_len = bh->b_size >> inode->i_blkbits; + + ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map, + flags); + if (ret > 0) { + map_bh(bh, inode->i_sb, map.m_pblk); + ext4_update_bh_state(bh, map.m_flags); + bh->b_size = inode->i_sb->s_blocksize * map.m_len; + ret = 0; + } else if (ret == 0) { + /* hole case, need to fill in bh->b_size */ + bh->b_size = inode->i_sb->s_blocksize * map.m_len; + } + return ret; +} + +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + return _ext4_get_block(inode, iblock, bh, + create ? EXT4_GET_BLOCKS_CREATE : 0); +} + +/* + * Get block function used when preparing for buffered write if we require + * creating an unwritten extent if blocks haven't been allocated. The extent + * will be converted to written after the IO is complete. + */ +int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + + ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", + inode->i_ino, create); + ret = _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); + + /* + * If the buffer is marked unwritten, mark it as new to make sure it is + * zeroed out correctly in case of partial writes. Otherwise, there is + * a chance of stale data getting exposed. + */ + if (ret == 0 && buffer_unwritten(bh_result)) + set_buffer_new(bh_result); + + return ret; +} + +/* Maximum number of blocks we map for direct IO at once. */ +#define DIO_MAX_BLOCKS 4096 + +/* + * `handle' can be NULL if create is zero + */ +struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, + ext4_lblk_t block, int map_flags) +{ + struct ext4_map_blocks map; + struct buffer_head *bh; + int create = map_flags & EXT4_GET_BLOCKS_CREATE; + bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT; + int err; + + ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + || handle != NULL || create == 0); + ASSERT(create == 0 || !nowait); + + map.m_lblk = block; + map.m_len = 1; + err = ext4_map_blocks(handle, inode, &map, map_flags); + + if (err == 0) + return create ? ERR_PTR(-ENOSPC) : NULL; + if (err < 0) + return ERR_PTR(err); + + if (nowait) + return sb_find_get_block(inode->i_sb, map.m_pblk); + + /* + * Since bh could introduce extra ref count such as referred by + * journal_head etc. Try to avoid using __GFP_MOVABLE here + * as it may fail the migration when journal_head remains. + */ + bh = getblk_unmovable(inode->i_sb->s_bdev, map.m_pblk, + inode->i_sb->s_blocksize); + + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); + if (map.m_flags & EXT4_MAP_NEW) { + ASSERT(create != 0); + ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + || (handle != NULL)); + + /* + * Now that we do not always journal data, we should + * keep in mind whether this should always journal the + * new buffer as metadata. For now, regular file + * writes use ext4_get_block instead, so it's not a + * problem. 
+ */ + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext4_journal_get_create_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); + if (unlikely(err)) { + unlock_buffer(bh); + goto errout; + } + if (!buffer_uptodate(bh)) { + memset(bh->b_data, 0, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + } + unlock_buffer(bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (unlikely(err)) + goto errout; + } else + BUFFER_TRACE(bh, "not a new buffer"); + return bh; +errout: + brelse(bh); + return ERR_PTR(err); +} + +struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, + ext4_lblk_t block, int map_flags) +{ + struct buffer_head *bh; + int ret; + + bh = ext4_getblk(handle, inode, block, map_flags); + if (IS_ERR(bh)) + return bh; + if (!bh || ext4_buffer_uptodate(bh)) + return bh; + + ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true); + if (ret) { + put_bh(bh); + return ERR_PTR(ret); + } + return bh; +} + +/* Read a contiguous batch of blocks. */ +int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, + bool wait, struct buffer_head **bhs) +{ + int i, err; + + for (i = 0; i < bh_count; i++) { + bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */); + if (IS_ERR(bhs[i])) { + err = PTR_ERR(bhs[i]); + bh_count = i; + goto out_brelse; + } + } + + for (i = 0; i < bh_count; i++) + /* Note that NULL bhs[i] is valid because of holes. */ + if (bhs[i] && !ext4_buffer_uptodate(bhs[i])) + ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false); + + if (!wait) + return 0; + + for (i = 0; i < bh_count; i++) + if (bhs[i]) + wait_on_buffer(bhs[i]); + + for (i = 0; i < bh_count; i++) { + if (bhs[i] && !buffer_uptodate(bhs[i])) { + err = -EIO; + goto out_brelse; + } + } + return 0; + +out_brelse: + for (i = 0; i < bh_count; i++) { + brelse(bhs[i]); + bhs[i] = NULL; + } + return err; +} + +int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, struct inode *inode, + struct buffer_head *bh)) +{ + struct buffer_head *bh; + unsigned block_start, block_end; + unsigned blocksize = head->b_size; + int err, ret = 0; + struct buffer_head *next; + + for (bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = next) { + next = bh->b_this_page; + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (partial && !buffer_uptodate(bh)) + *partial = 1; + continue; + } + err = (*fn)(handle, inode, bh); + if (!ret) + ret = err; + } + return ret; +} + +/* + * Helper for handling dirtying of journalled data. We also mark the folio as + * dirty so that writeback code knows about this page (and inode) contains + * dirty data. ext4_writepages() then commits appropriate transaction to + * make data stable. 
+ */ +static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) +{ + struct folio *folio = bh->b_folio; + struct inode *inode = folio->mapping->host; + + /* only regular files have a_ops */ + if (S_ISREG(inode->i_mode)) + folio_mark_dirty(folio); + return ext4_handle_dirty_metadata(handle, NULL, bh); +} + +int do_journal_get_write_access(handle_t *handle, struct inode *inode, + struct buffer_head *bh) +{ + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + BUFFER_TRACE(bh, "get write access"); + return ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); +} + +int ext4_block_write_begin(handle_t *handle, struct folio *folio, + loff_t pos, unsigned len, + get_block_t *get_block) +{ + unsigned int from = offset_in_folio(folio, pos); + unsigned to = from + len; + struct inode *inode = folio->mapping->host; + unsigned block_start, block_end; + sector_t block; + int err = 0; + unsigned blocksize = inode->i_sb->s_blocksize; + unsigned bbits; + struct buffer_head *bh, *head, *wait[2]; + int nr_wait = 0; + int i; + bool should_journal_data = ext4_should_journal_data(inode); + + BUG_ON(!folio_test_locked(folio)); + BUG_ON(to > folio_size(folio)); + BUG_ON(from > to); + + head = folio_buffers(folio); + if (!head) + head = create_empty_buffers(folio, blocksize, 0); + bbits = ilog2(blocksize); + block = (sector_t)folio->index << (PAGE_SHIFT - bbits); + + for (bh = head, block_start = 0; bh != head || !block_start; + block++, block_start = block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (folio_test_uptodate(folio)) { + set_buffer_uptodate(bh); + } + continue; + } + if (WARN_ON_ONCE(buffer_new(bh))) + clear_buffer_new(bh); + if (!buffer_mapped(bh)) { + WARN_ON(bh->b_size != blocksize); + err = ext4_journal_ensure_extent_credits(handle, inode); + if (!err) + err = get_block(inode, block, bh, 1); + if (err) + break; + if (buffer_new(bh)) { + /* + * We may be zeroing partial buffers or all new + * buffers in case of failure. Prepare JBD2 for + * that. + */ + if (should_journal_data) + do_journal_get_write_access(handle, + inode, bh); + if (folio_test_uptodate(folio)) { + /* + * Unlike __block_write_begin() we leave + * dirtying of new uptodate buffers to + * ->write_end() time or + * folio_zero_new_buffers(). + */ + set_buffer_uptodate(bh); + continue; + } + if (block_end > to || block_start < from) + folio_zero_segments(folio, to, + block_end, + block_start, from); + continue; + } + } + if (folio_test_uptodate(folio)) { + set_buffer_uptodate(bh); + continue; + } + if (!buffer_uptodate(bh) && !buffer_delay(bh) && + !buffer_unwritten(bh) && + (block_start < from || block_end > to)) { + ext4_read_bh_lock(bh, 0, false); + wait[nr_wait++] = bh; + } + } + /* + * If we issued read requests, let them complete. 
+ */ + for (i = 0; i < nr_wait; i++) { + wait_on_buffer(wait[i]); + if (!buffer_uptodate(wait[i])) + err = -EIO; + } + if (unlikely(err)) { + if (should_journal_data) + ext4_journalled_zero_new_buffers(handle, inode, folio, + from, to); + else + folio_zero_new_buffers(folio, from, to); + } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { + for (i = 0; i < nr_wait; i++) { + int err2; + + err2 = fscrypt_decrypt_pagecache_blocks(folio, + blocksize, bh_offset(wait[i])); + if (err2) { + clear_buffer_uptodate(wait[i]); + err = err2; + } + } + } + + return err; +} + +/* + * To preserve ordering, it is essential that the hole instantiation and + * the data write be encapsulated in a single transaction. We cannot + * close off a transaction and start a new one between the ext4_get_block() + * and the ext4_write_end(). So doing the jbd2_journal_start at the start of + * ext4_write_begin() is the right place. + */ +static int ext4_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) +{ + struct inode *inode = mapping->host; + int ret, needed_blocks; + handle_t *handle; + int retries = 0; + struct folio *folio; + pgoff_t index; + unsigned from, to; + + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; + + trace_ext4_write_begin(inode, pos, len); + /* + * Reserve one block more for addition to orphan list in case + * we allocate blocks but write fails for some reason + */ + needed_blocks = ext4_chunk_trans_extent(inode, + ext4_journal_blocks_per_folio(inode)) + 1; + index = pos >> PAGE_SHIFT; + + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, + foliop); + if (ret < 0) + return ret; + if (ret == 1) + return 0; + } + + /* + * write_begin_get_folio() can take a long time if the + * system is thrashing due to memory pressure, or if the folio + * is being written back. So grab it first before we start + * the transaction handle. This also allows us to allocate + * the folio (if needed) without using GFP_NOFS. + */ +retry_grab: + folio = write_begin_get_folio(iocb, mapping, index, len); + if (IS_ERR(folio)) + return PTR_ERR(folio); + + if (pos + len > folio_pos(folio) + folio_size(folio)) + len = folio_pos(folio) + folio_size(folio) - pos; + + from = offset_in_folio(folio, pos); + to = from + len; + + /* + * The same as page allocation, we prealloc buffer heads before + * starting the handle. 
+ */ + if (!folio_buffers(folio)) + create_empty_buffers(folio, inode->i_sb->s_blocksize, 0); + + folio_unlock(folio); + +retry_journal: + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); + if (IS_ERR(handle)) { + folio_put(folio); + return PTR_ERR(handle); + } + + folio_lock(folio); + if (folio->mapping != mapping) { + /* The folio got truncated from under us */ + folio_unlock(folio); + folio_put(folio); + ext4_journal_stop(handle); + goto retry_grab; + } + /* In case writeback began while the folio was unlocked */ + folio_wait_stable(folio); + + if (ext4_should_dioread_nolock(inode)) + ret = ext4_block_write_begin(handle, folio, pos, len, + ext4_get_block_unwritten); + else + ret = ext4_block_write_begin(handle, folio, pos, len, + ext4_get_block); + if (!ret && ext4_should_journal_data(inode)) { + ret = ext4_walk_page_buffers(handle, inode, + folio_buffers(folio), from, to, + NULL, do_journal_get_write_access); + } + + if (ret) { + bool extended = (pos + len > inode->i_size) && + !ext4_verity_in_progress(inode); + + folio_unlock(folio); + /* + * ext4_block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_rwsem. + * + * Add inode to orphan list in case we crash before + * truncate finishes + */ + if (extended && ext4_can_truncate(inode)) + ext4_orphan_add(handle, inode); + + ext4_journal_stop(handle); + if (extended) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might + * still be on the orphan list; we need to + * make sure the inode is removed from the + * orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + if (ret == -EAGAIN || + (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries))) + goto retry_journal; + folio_put(folio); + return ret; + } + *foliop = folio; + return ret; +} + +/* For write_end() in data=journal mode */ +static int write_end_fn(handle_t *handle, struct inode *inode, + struct buffer_head *bh) +{ + int ret; + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + set_buffer_uptodate(bh); + ret = ext4_dirty_journalled_data(handle, bh); + clear_buffer_meta(bh); + clear_buffer_prio(bh); + clear_buffer_new(bh); + return ret; +} + +/* + * We need to pick up the new inode size which generic_commit_write gave us + * `iocb` can be NULL - eg, when called from page_symlink(). + * + * ext4 never places buffers on inode->i_mapping->i_private_list. metadata + * buffers are managed internally. + */ +static int ext4_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) +{ + handle_t *handle = ext4_journal_current_handle(); + struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; + int ret = 0, ret2; + int i_size_changed = 0; + bool verity = ext4_verity_in_progress(inode); + + trace_ext4_write_end(inode, pos, len, copied); + + if (ext4_has_inline_data(inode) && + ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) + return ext4_write_inline_data_end(inode, pos, len, copied, + folio); + + copied = block_write_end(pos, len, copied, folio); + /* + * it's important to update i_size while still holding folio lock: + * page writeout could otherwise come in and zero beyond i_size. + * + * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree + * blocks are being written past EOF, so skip the i_size update. 
+ */ + if (!verity) + i_size_changed = ext4_update_inode_size(inode, pos + copied); + folio_unlock(folio); + folio_put(folio); + + if (old_size < pos && !verity) { + pagecache_isize_extended(inode, old_size, pos); + ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); + } + /* + * Don't mark the inode dirty under folio lock. First, it unnecessarily + * makes the holding time of folio lock longer. Second, it forces lock + * ordering of folio lock and transaction start for journaling + * filesystems. + */ + if (i_size_changed) + ret = ext4_mark_inode_dirty(handle, inode); + + if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) + /* if we have allocated more blocks and copied + * less. We will have blocks allocated outside + * inode->i_size. So truncate them + */ + ext4_orphan_add(handle, inode); + + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + + if (pos + len > inode->i_size && !verity) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + return ret ? ret : copied; +} + +/* + * This is a private version of folio_zero_new_buffers() which doesn't + * set the buffer to be dirty, since in data=journalled mode we need + * to call ext4_dirty_journalled_data() instead. + */ +static void ext4_journalled_zero_new_buffers(handle_t *handle, + struct inode *inode, + struct folio *folio, + unsigned from, unsigned to) +{ + unsigned int block_start = 0, block_end; + struct buffer_head *head, *bh; + + bh = head = folio_buffers(folio); + do { + block_end = block_start + bh->b_size; + if (buffer_new(bh)) { + if (block_end > from && block_start < to) { + if (!folio_test_uptodate(folio)) { + unsigned start, size; + + start = max(from, block_start); + size = min(to, block_end) - start; + + folio_zero_range(folio, start, size); + } + clear_buffer_new(bh); + write_end_fn(handle, inode, bh); + } + } + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); +} + +static int ext4_journalled_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) +{ + handle_t *handle = ext4_journal_current_handle(); + struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; + int ret = 0, ret2; + int partial = 0; + unsigned from, to; + int size_changed = 0; + bool verity = ext4_verity_in_progress(inode); + + trace_ext4_journalled_write_end(inode, pos, len, copied); + from = pos & (PAGE_SIZE - 1); + to = from + len; + + BUG_ON(!ext4_handle_valid(handle)); + + if (ext4_has_inline_data(inode)) + return ext4_write_inline_data_end(inode, pos, len, copied, + folio); + + if (unlikely(copied < len) && !folio_test_uptodate(folio)) { + copied = 0; + ext4_journalled_zero_new_buffers(handle, inode, folio, + from, to); + } else { + if (unlikely(copied < len)) + ext4_journalled_zero_new_buffers(handle, inode, folio, + from + copied, to); + ret = ext4_walk_page_buffers(handle, inode, + folio_buffers(folio), + from, from + copied, &partial, + write_end_fn); + if (!partial) + folio_mark_uptodate(folio); + } + if (!verity) + size_changed = ext4_update_inode_size(inode, pos + copied); + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; + folio_unlock(folio); + folio_put(folio); + + if (old_size < pos && !verity) { + pagecache_isize_extended(inode, 
old_size, pos); + ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); + } + + if (size_changed) { + ret2 = ext4_mark_inode_dirty(handle, inode); + if (!ret) + ret = ret2; + } + + if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) + /* if we have allocated more blocks and copied + * less. We will have blocks allocated outside + * inode->i_size. So truncate them + */ + ext4_orphan_add(handle, inode); + + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + if (pos + len > inode->i_size && !verity) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + return ret ? ret : copied; +} + +/* + * Reserve space for 'nr_resv' clusters + */ +static int ext4_da_reserve_space(struct inode *inode, int nr_resv) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + int ret; + + /* + * We will charge metadata quota at writeout time; this saves + * us from metadata over-estimation, though we may go over by + * a small amount in the end. Here we just reserve for data. + */ + ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv)); + if (ret) + return ret; + + spin_lock(&ei->i_block_reservation_lock); + if (ext4_claim_free_clusters(sbi, nr_resv, 0)) { + spin_unlock(&ei->i_block_reservation_lock); + dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv)); + return -ENOSPC; + } + ei->i_reserved_data_blocks += nr_resv; + trace_ext4_da_reserve_space(inode, nr_resv); + spin_unlock(&ei->i_block_reservation_lock); + + return 0; /* success */ +} + +void ext4_da_release_space(struct inode *inode, int to_free) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + + if (!to_free) + return; /* Nothing to release, exit */ + + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + + trace_ext4_da_release_space(inode, to_free); + if (unlikely(to_free > ei->i_reserved_data_blocks)) { + /* + * if there aren't enough reserved blocks, then the + * counter is messed up somewhere. Since this + * function is called from invalidate page, it's + * harmless to return without any action. + */ + ext4_warning(inode->i_sb, "ext4_da_release_space: " + "ino %lu, to_free %d with only %d reserved " + "data blocks", inode->i_ino, to_free, + ei->i_reserved_data_blocks); + WARN_ON(1); + to_free = ei->i_reserved_data_blocks; + } + ei->i_reserved_data_blocks -= to_free; + + /* update fs dirty data blocks counter */ + percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); + + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + + dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); +} + +/* + * Delayed allocation stuff + */ + +struct mpage_da_data { + /* These are input fields for ext4_do_writepages() */ + struct inode *inode; + struct writeback_control *wbc; + unsigned int can_map:1; /* Can writepages call map blocks? */ + + /* These are internal state of ext4_do_writepages() */ + loff_t start_pos; /* The start pos to write */ + loff_t next_pos; /* Current pos to examine */ + loff_t end_pos; /* Last pos to examine */ + + /* + * Extent to map - this can be after start_pos because that can be + * fully mapped. We somewhat abuse m_flags to store whether the extent + * is delalloc or unwritten. 
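+	 * (The flag bits kept here are the BH_Unwritten and BH_Delay buffer
+	 * bits collected by mpage_add_bh_to_extent().)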
+ */ + struct ext4_map_blocks map; + struct ext4_io_submit io_submit; /* IO submission data */ + unsigned int do_map:1; + unsigned int scanned_until_end:1; + unsigned int journalled_more_data:1; +}; + +static void mpage_release_unused_pages(struct mpage_da_data *mpd, + bool invalidate) +{ + unsigned nr, i; + pgoff_t index, end; + struct folio_batch fbatch; + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; + + /* This is necessary when next_pos == 0. */ + if (mpd->start_pos >= mpd->next_pos) + return; + + mpd->scanned_until_end = 0; + if (invalidate) { + ext4_lblk_t start, last; + start = EXT4_B_TO_LBLK(inode, mpd->start_pos); + last = mpd->next_pos >> inode->i_blkbits; + + /* + * avoid racing with extent status tree scans made by + * ext4_insert_delayed_block() + */ + down_write(&EXT4_I(inode)->i_data_sem); + ext4_es_remove_extent(inode, start, last - start); + up_write(&EXT4_I(inode)->i_data_sem); + } + + folio_batch_init(&fbatch); + index = mpd->start_pos >> PAGE_SHIFT; + end = mpd->next_pos >> PAGE_SHIFT; + while (index < end) { + nr = filemap_get_folios(mapping, &index, end - 1, &fbatch); + if (nr == 0) + break; + for (i = 0; i < nr; i++) { + struct folio *folio = fbatch.folios[i]; + + if (folio_pos(folio) < mpd->start_pos) + continue; + if (folio_next_index(folio) > end) + continue; + BUG_ON(!folio_test_locked(folio)); + BUG_ON(folio_test_writeback(folio)); + if (invalidate) { + if (folio_mapped(folio)) + folio_clear_dirty_for_io(folio); + block_invalidate_folio(folio, 0, + folio_size(folio)); + folio_clear_uptodate(folio); + } + folio_unlock(folio); + } + folio_batch_release(&fbatch); + } +} + +static void ext4_print_free_blocks(struct inode *inode) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct super_block *sb = inode->i_sb; + struct ext4_inode_info *ei = EXT4_I(inode); + + ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", + EXT4_C2B(EXT4_SB(inode->i_sb), + ext4_count_free_clusters(sb))); + ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); + ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", + (long long) EXT4_C2B(EXT4_SB(sb), + percpu_counter_sum(&sbi->s_freeclusters_counter))); + ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", + (long long) EXT4_C2B(EXT4_SB(sb), + percpu_counter_sum(&sbi->s_dirtyclusters_counter))); + ext4_msg(sb, KERN_CRIT, "Block reservation details"); + ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", + ei->i_reserved_data_blocks); + return; +} + +/* + * Check whether the cluster containing lblk has been allocated or has + * delalloc reservation. + * + * Returns 0 if the cluster doesn't have either, 1 if it has delalloc + * reservation, 2 if it's already been allocated, negative error code on + * failure. + */ +static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int ret; + + /* Has delalloc reservation? */ + if (ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk)) + return 1; + + /* Already been allocated? 
*/
+	if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk))
+		return 2;
+	ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk));
+	if (ret < 0)
+		return ret;
+	if (ret > 0)
+		return 2;
+
+	return 0;
+}
+
+/*
+ * ext4_insert_delayed_blocks - adds multiple delayed blocks to the extents
+ *                              status tree, incrementing the reserved
+ *                              cluster/block count or making pending
+ *                              reservations where needed
+ *
+ * @inode - file containing the newly added block
+ * @lblk - start logical block to be added
+ * @len - length of blocks to be added
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk,
+				      ext4_lblk_t len)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	int ret;
+	bool lclu_allocated = false;
+	bool end_allocated = false;
+	ext4_lblk_t resv_clu;
+	ext4_lblk_t end = lblk + len - 1;
+
+	/*
+	 * If the cluster containing lblk or end is shared with a delayed,
+	 * written, or unwritten extent in a bigalloc file system, it's
+	 * already been accounted for and does not need to be reserved.
+	 * A pending reservation must be made for the cluster if it's
+	 * shared with a written or unwritten extent and doesn't already
+	 * have one. Written and unwritten extents can be purged from the
+	 * extents status tree if the system is under memory pressure, so
+	 * it's necessary to examine the extent tree if a search of the
+	 * extents status tree doesn't get a match.
+	 */
+	if (sbi->s_cluster_ratio == 1) {
+		ret = ext4_da_reserve_space(inode, len);
+		if (ret != 0)   /* ENOSPC */
+			return ret;
+	} else {   /* bigalloc */
+		resv_clu = EXT4_B2C(sbi, end) - EXT4_B2C(sbi, lblk) + 1;
+
+		ret = ext4_clu_alloc_state(inode, lblk);
+		if (ret < 0)
+			return ret;
+		if (ret > 0) {
+			resv_clu--;
+			lclu_allocated = (ret == 2);
+		}
+
+		if (EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, end)) {
+			ret = ext4_clu_alloc_state(inode, end);
+			if (ret < 0)
+				return ret;
+			if (ret > 0) {
+				resv_clu--;
+				end_allocated = (ret == 2);
+			}
+		}
+
+		if (resv_clu) {
+			ret = ext4_da_reserve_space(inode, resv_clu);
+			if (ret != 0)   /* ENOSPC */
+				return ret;
+		}
+	}
+
+	ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated,
+				      end_allocated);
+	return 0;
+}
+
+/*
+ * Looks up the requested blocks and sets the delalloc extent map.
+ * First try to look up the extent entry that contains the requested
+ * blocks in the extent status tree without i_data_sem, then try to look
+ * up the on-disk extent mapping with i_data_sem held in read mode;
+ * finally, hold i_data_sem in write mode, look up again and add a
+ * delalloc extent entry if we still couldn't find any extent. Pass out
+ * the mapped extent through @map and return 0 on success.
+ */
+static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
+{
+	struct extent_status es;
+	int retval;
+#ifdef ES_AGGRESSIVE_TEST
+	struct ext4_map_blocks orig_map;
+
+	memcpy(&orig_map, map, sizeof(*map));
+#endif
+
+	map->m_flags = 0;
+	ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
+		  (unsigned long) map->m_lblk);
+
+	ext4_check_map_extents_env(inode);
+
+	/* Look up the extent status tree first */
+	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+		map->m_len = min_t(unsigned int, map->m_len,
+				   es.es_len - (map->m_lblk - es.es_lblk));
+
+		if (ext4_es_is_hole(&es))
+			goto add_delayed;
+
+found:
+		/*
+		 * Delayed extent could be allocated by fallocate.
+		 * So we need to check it.
+		 */
+		if (ext4_es_is_delayed(&es)) {
+			map->m_flags |= EXT4_MAP_DELAYED;
+			return 0;
+		}
+
+		map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk;
+		if (ext4_es_is_written(&es))
+			map->m_flags |= EXT4_MAP_MAPPED;
+		else if (ext4_es_is_unwritten(&es))
+			map->m_flags |= EXT4_MAP_UNWRITTEN;
+		else
+			BUG();
+
+#ifdef ES_AGGRESSIVE_TEST
+		ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
+#endif
+		return 0;
+	}
+
+	/*
+	 * Try to see if we can get the block without requesting a new
+	 * file system block.
+	 */
+	down_read(&EXT4_I(inode)->i_data_sem);
+	if (ext4_has_inline_data(inode))
+		retval = 0;
+	else
+		retval = ext4_map_query_blocks(NULL, inode, map, 0);
+	up_read(&EXT4_I(inode)->i_data_sem);
+	if (retval)
+		return retval < 0 ? retval : 0;
+
+add_delayed:
+	down_write(&EXT4_I(inode)->i_data_sem);
+	/*
+	 * Page fault path (ext4_page_mkwrite does not take i_rwsem)
+	 * and fallocate path (no folio lock) can race. Make sure we
+	 * lookup the extent status tree here again while i_data_sem
+	 * is held in write mode, before inserting a new da entry in
+	 * the extent status tree.
+	 */
+	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+		map->m_len = min_t(unsigned int, map->m_len,
+				   es.es_len - (map->m_lblk - es.es_lblk));
+
+		if (!ext4_es_is_hole(&es)) {
+			up_write(&EXT4_I(inode)->i_data_sem);
+			goto found;
+		}
+	} else if (!ext4_has_inline_data(inode)) {
+		retval = ext4_map_query_blocks(NULL, inode, map, 0);
+		if (retval) {
+			up_write(&EXT4_I(inode)->i_data_sem);
+			return retval < 0 ? retval : 0;
+		}
+	}
+
+	map->m_flags |= EXT4_MAP_DELAYED;
+	retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len);
+	up_write(&EXT4_I(inode)->i_data_sem);
+
+	return retval;
+}
+
+/*
+ * This is a special get_block_t callback which is used by
+ * ext4_da_write_begin(). It will either return a mapped block or
+ * reserve space for a single block.
+ *
+ * For a delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
+ * We also have b_blocknr = -1 and b_bdev initialized properly
+ *
+ * For an unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
+ * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
+ * initialized properly.
+ */
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+			   struct buffer_head *bh, int create)
+{
+	struct ext4_map_blocks map;
+	sector_t invalid_block = ~((sector_t) 0xffff);
+	int ret = 0;
+
+	BUG_ON(create == 0);
+	BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
+
+	if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
+		invalid_block = ~0;
+
+	map.m_lblk = iblock;
+	map.m_len = 1;
+
+	/*
+	 * First, we need to know whether the block is already allocated;
+	 * preallocated blocks are unmapped but should be treated
+	 * the same as allocated blocks.
+	 */
+	ret = ext4_da_map_blocks(inode, &map);
+	if (ret < 0)
+		return ret;
+
+	if (map.m_flags & EXT4_MAP_DELAYED) {
+		map_bh(bh, inode->i_sb, invalid_block);
+		set_buffer_new(bh);
+		set_buffer_delay(bh);
+		return 0;
+	}
+
+	map_bh(bh, inode->i_sb, map.m_pblk);
+	ext4_update_bh_state(bh, map.m_flags);
+
+	if (buffer_unwritten(bh)) {
+		/* A delayed write to an unwritten bh should be marked
+		 * new and mapped. Mapped ensures that we don't do
+		 * get_block multiple times when we write to the same
+		 * offset and new ensures that we do proper zero out
+		 * for partial write.
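+		 * (BH_New is what makes the generic write code zero out the
+		 * parts of the block that the copy does not cover, cf.
+		 * folio_zero_new_buffers().)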
+ */ + set_buffer_new(bh); + set_buffer_mapped(bh); + } + return 0; +} + +static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio) +{ + mpd->start_pos += folio_size(folio); + mpd->wbc->nr_to_write -= folio_nr_pages(folio); + folio_unlock(folio); +} + +static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) +{ + size_t len; + loff_t size; + int err; + + WARN_ON_ONCE(folio_pos(folio) != mpd->start_pos); + folio_clear_dirty_for_io(folio); + /* + * We have to be very careful here! Nothing protects writeback path + * against i_size changes and the page can be writeably mapped into + * page tables. So an application can be growing i_size and writing + * data through mmap while writeback runs. folio_clear_dirty_for_io() + * write-protects our page in page tables and the page cannot get + * written to again until we release folio lock. So only after + * folio_clear_dirty_for_io() we are safe to sample i_size for + * ext4_bio_write_folio() to zero-out tail of the written page. We rely + * on the barrier provided by folio_test_clear_dirty() in + * folio_clear_dirty_for_io() to make sure i_size is really sampled only + * after page tables are updated. + */ + size = i_size_read(mpd->inode); + len = folio_size(folio); + if (folio_pos(folio) + len > size && + !ext4_verity_in_progress(mpd->inode)) + len = size & (len - 1); + err = ext4_bio_write_folio(&mpd->io_submit, folio, len); + + return err; +} + +#define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay)) + +/* + * mballoc gives us at most this number of blocks... + * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). + * The rest of mballoc seems to handle chunks up to full group size. + */ +#define MAX_WRITEPAGES_EXTENT_LEN 2048 + +/* + * mpage_add_bh_to_extent - try to add bh to extent of blocks to map + * + * @mpd - extent of blocks + * @lblk - logical number of the block in the file + * @bh - buffer head we want to add to the extent + * + * The function is used to collect contig. blocks in the same state. If the + * buffer doesn't require mapping for writeback and we haven't started the + * extent of buffers to map yet, the function returns 'true' immediately - the + * caller can write the buffer right away. Otherwise the function returns true + * if the block has been added to the extent, false if the block couldn't be + * added. + */ +static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, + struct buffer_head *bh) +{ + struct ext4_map_blocks *map = &mpd->map; + + /* Buffer that doesn't need mapping for writeback? */ + if (!buffer_dirty(bh) || !buffer_mapped(bh) || + (!buffer_delay(bh) && !buffer_unwritten(bh))) { + /* So far no extent to map => we write the buffer right away */ + if (map->m_len == 0) + return true; + return false; + } + + /* First block in the extent? */ + if (map->m_len == 0) { + /* We cannot map unless handle is started... */ + if (!mpd->do_map) + return false; + map->m_lblk = lblk; + map->m_len = 1; + map->m_flags = bh->b_state & BH_FLAGS; + return true; + } + + /* Don't go larger than mballoc is willing to allocate */ + if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) + return false; + + /* Can we merge the block to our big extent? 
*/
+	if (lblk == map->m_lblk + map->m_len &&
+	    (bh->b_state & BH_FLAGS) == map->m_flags) {
+		map->m_len++;
+		return true;
+	}
+	return false;
+}
+
+/*
+ * mpage_process_page_bufs - submit page buffers for IO or add them to extent
+ *
+ * @mpd - extent of blocks for mapping
+ * @head - the first buffer in the page
+ * @bh - buffer we should start processing from
+ * @lblk - logical number of the block in the file corresponding to @bh
+ *
+ * Walk through page buffers from @bh up to @head (exclusive) and either submit
+ * the page for IO if all buffers in this page were mapped and there's no
+ * accumulated extent of buffers to map or add buffers in the page to the
+ * extent of buffers to map. The function returns 1 if the caller can continue
+ * by processing the next page, 0 if it should stop adding buffers to the
+ * extent to map because we cannot extend it anymore. It can also return a
+ * value < 0 in case of error during IO submission.
+ */
+static int mpage_process_page_bufs(struct mpage_da_data *mpd,
+				   struct buffer_head *head,
+				   struct buffer_head *bh,
+				   ext4_lblk_t lblk)
+{
+	struct inode *inode = mpd->inode;
+	int err;
+	ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1)
+							>> inode->i_blkbits;
+
+	if (ext4_verity_in_progress(inode))
+		blocks = EXT_MAX_BLOCKS;
+
+	do {
+		BUG_ON(buffer_locked(bh));
+
+		if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) {
+			/* Found extent to map? */
+			if (mpd->map.m_len)
+				return 0;
+			/* Buffer needs mapping and handle is not started? */
+			if (!mpd->do_map)
+				return 0;
+			/* Everything mapped so far and we hit EOF */
+			break;
+		}
+	} while (lblk++, (bh = bh->b_this_page) != head);
+	/* So far everything mapped? Submit the page for IO. */
+	if (mpd->map.m_len == 0) {
+		err = mpage_submit_folio(mpd, head->b_folio);
+		if (err < 0)
+			return err;
+		mpage_folio_done(mpd, head->b_folio);
+	}
+	if (lblk >= blocks) {
+		mpd->scanned_until_end = 1;
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * mpage_process_folio - update folio buffers corresponding to changed extent
+ *			 and possibly submit a fully mapped page for IO
+ * @mpd: description of extent to map, on return next extent to map
+ * @folio: Contains these buffers.
+ * @m_lblk: logical block mapping.
+ * @m_pblk: corresponding physical mapping.
+ * @map_bh: determines on return whether this page requires any further
+ *	    mapping or not.
+ *
+ * Scan given folio buffers corresponding to changed extent and update buffer
+ * state according to new extent state.
+ * We map delalloc buffers to their physical location, clear unwritten bits.
+ * If the given folio is not fully mapped, we update @mpd to the next extent in
+ * the given folio that needs mapping & return @map_bh as true.
+ */
+static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio,
+			       ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
+			       bool *map_bh)
+{
+	struct buffer_head *head, *bh;
+	ext4_io_end_t *io_end = mpd->io_submit.io_end;
+	ext4_lblk_t lblk = *m_lblk;
+	ext4_fsblk_t pblock = *m_pblk;
+	int err = 0;
+	int blkbits = mpd->inode->i_blkbits;
+	ssize_t io_end_size = 0;
+	struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);
+
+	bh = head = folio_buffers(folio);
+	do {
+		if (lblk < mpd->map.m_lblk)
+			continue;
+		if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+			/*
+			 * Buffer after end of mapped extent.
+			 * Find next buffer in the folio to map.
+			 */
+			mpd->map.m_len = 0;
+			mpd->map.m_flags = 0;
+			io_end_vec->size += io_end_size;
+
+			err = mpage_process_page_bufs(mpd, head, bh, lblk);
+			if (err > 0)
+				err = 0;
+			if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
+				io_end_vec = ext4_alloc_io_end_vec(io_end);
+				if (IS_ERR(io_end_vec)) {
+					err = PTR_ERR(io_end_vec);
+					goto out;
+				}
+				io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits;
+			}
+			*map_bh = true;
+			goto out;
+		}
+		if (buffer_delay(bh)) {
+			clear_buffer_delay(bh);
+			bh->b_blocknr = pblock++;
+		}
+		clear_buffer_unwritten(bh);
+		io_end_size += (1 << blkbits);
+	} while (lblk++, (bh = bh->b_this_page) != head);
+
+	io_end_vec->size += io_end_size;
+	*map_bh = false;
+out:
+	*m_lblk = lblk;
+	*m_pblk = pblock;
+	return err;
+}
+
+/*
+ * mpage_map_and_submit_buffers - update buffers corresponding to changed
+ *				  extent and submit fully mapped pages for IO
+ *
+ * @mpd - description of extent to map, on return next extent to map
+ *
+ * Scan buffers corresponding to changed extent (we expect corresponding pages
+ * to be already locked) and update buffer state according to new extent state.
+ * We map delalloc buffers to their physical location, clear unwritten bits,
+ * and mark buffers as uninit when we perform writes to unwritten extents
+ * and do extent conversion after IO is finished. If the last page is not fully
+ * mapped, we update @map to the next extent in the last page that needs
+ * mapping. Otherwise we submit the page for IO.
+ */
+static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
+{
+	struct folio_batch fbatch;
+	unsigned nr, i;
+	struct inode *inode = mpd->inode;
+	int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
+	pgoff_t start, end;
+	ext4_lblk_t lblk;
+	ext4_fsblk_t pblock;
+	int err;
+	bool map_bh = false;
+
+	start = mpd->map.m_lblk >> bpp_bits;
+	end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
+	pblock = mpd->map.m_pblk;
+
+	folio_batch_init(&fbatch);
+	while (start <= end) {
+		nr = filemap_get_folios(inode->i_mapping, &start, end, &fbatch);
+		if (nr == 0)
+			break;
+		for (i = 0; i < nr; i++) {
+			struct folio *folio = fbatch.folios[i];
+
+			lblk = folio->index << bpp_bits;
+			err = mpage_process_folio(mpd, folio, &lblk, &pblock,
+						  &map_bh);
+			/*
+			 * If map_bh is true, it means the page may require
+			 * further bh mapping, or maybe the page was submitted
+			 * for IO. So we return to call further extent mapping.
+			 */
+			if (err < 0 || map_bh)
+				goto out;
+			/* Page fully mapped - let IO run! */
+			err = mpage_submit_folio(mpd, folio);
+			if (err < 0)
+				goto out;
+			mpage_folio_done(mpd, folio);
+		}
+		folio_batch_release(&fbatch);
+	}
+	/* Extent fully mapped and matches with page boundary. We are done. */
+	mpd->map.m_len = 0;
+	mpd->map.m_flags = 0;
+	return 0;
+out:
+	folio_batch_release(&fbatch);
+	return err;
+}
+
+static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
+{
+	struct inode *inode = mpd->inode;
+	struct ext4_map_blocks *map = &mpd->map;
+	int get_blocks_flags;
+	int err, dioread_nolock;
+
+	/* Make sure transaction has enough credits for this extent */
+	err = ext4_journal_ensure_extent_credits(handle, inode);
+	if (err < 0)
+		return err;
+
+	trace_ext4_da_write_pages_extent(inode, map);
+	/*
+	 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
+	 * to convert an unwritten extent to be initialized (in the case
+	 * where we have written into one or more preallocated blocks). It is
+	 * possible that we're going to need more metadata blocks than
+	 * previously reserved.
However we must not fail because we're in + * writeback and there is nothing we can do about it so it might result + * in data loss. So use reserved blocks to allocate metadata if + * possible. In addition, do not cache any unrelated extents, as it + * only holds the folio lock but does not hold the i_rwsem or + * invalidate_lock, which could corrupt the extent status tree. + */ + get_blocks_flags = EXT4_GET_BLOCKS_CREATE | + EXT4_GET_BLOCKS_METADATA_NOFAIL | + EXT4_GET_BLOCKS_IO_SUBMIT | + EXT4_EX_NOCACHE; + + dioread_nolock = ext4_should_dioread_nolock(inode); + if (dioread_nolock) + get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; + + err = ext4_map_blocks(handle, inode, map, get_blocks_flags); + if (err < 0) + return err; + if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { + if (!mpd->io_submit.io_end->handle && + ext4_handle_valid(handle)) { + mpd->io_submit.io_end->handle = handle->h_rsv_handle; + handle->h_rsv_handle = NULL; + } + ext4_set_io_unwritten_flag(mpd->io_submit.io_end); + } + + BUG_ON(map->m_len == 0); + return 0; +} + +/* + * This is used to submit mapped buffers in a single folio that is not fully + * mapped for various reasons, such as insufficient space or journal credits. + */ +static int mpage_submit_partial_folio(struct mpage_da_data *mpd) +{ + struct inode *inode = mpd->inode; + struct folio *folio; + loff_t pos; + int ret; + + folio = filemap_get_folio(inode->i_mapping, + mpd->start_pos >> PAGE_SHIFT); + if (IS_ERR(folio)) + return PTR_ERR(folio); + /* + * The mapped position should be within the current processing folio + * but must not be the folio start position. + */ + pos = ((loff_t)mpd->map.m_lblk) << inode->i_blkbits; + if (WARN_ON_ONCE((folio_pos(folio) == pos) || + !folio_contains(folio, pos >> PAGE_SHIFT))) + return -EINVAL; + + ret = mpage_submit_folio(mpd, folio); + if (ret) + goto out; + /* + * Update start_pos to prevent this folio from being released in + * mpage_release_unused_pages(), it will be reset to the aligned folio + * pos when this folio is written again in the next round. Additionally, + * do not update wbc->nr_to_write here, as it will be updated once the + * entire folio has finished processing. + */ + mpd->start_pos = pos; +out: + folio_unlock(folio); + folio_put(folio); + return ret; +} + +/* + * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length + * mpd->len and submit pages underlying it for IO + * + * @handle - handle for journal operations + * @mpd - extent to map + * @give_up_on_write - we set this to true iff there is a fatal error and there + * is no hope of writing the data. The caller should discard + * dirty pages to avoid infinite loops. + * + * The function maps extent starting at mpd->lblk of length mpd->len. If it is + * delayed, blocks are allocated, if it is unwritten, we may need to convert + * them to initialized or split the described range from larger unwritten + * extent. Note that we need not map all the described range since allocation + * can return less blocks or the range is covered by more unwritten extents. We + * cannot map more because we are limited by reserved transaction credits. On + * the other hand we always make sure that the last touched page is fully + * mapped so that it can be written out (and thus forward progress is + * guaranteed). After mapping we submit all mapped pages for IO. 
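+ *
+ * On transient errors (ENOMEM, EAGAIN, or ENOSPC while free clusters
+ * remain), buffers already mapped in the current folio are first pushed
+ * out via mpage_submit_partial_folio() so that no stale mappings are
+ * left behind.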
+ */
+static int mpage_map_and_submit_extent(handle_t *handle,
+				       struct mpage_da_data *mpd,
+				       bool *give_up_on_write)
+{
+	struct inode *inode = mpd->inode;
+	struct ext4_map_blocks *map = &mpd->map;
+	int err;
+	loff_t disksize;
+	int progress = 0;
+	ext4_io_end_t *io_end = mpd->io_submit.io_end;
+	struct ext4_io_end_vec *io_end_vec;
+
+	io_end_vec = ext4_alloc_io_end_vec(io_end);
+	if (IS_ERR(io_end_vec))
+		return PTR_ERR(io_end_vec);
+	io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
+	do {
+		err = mpage_map_one_extent(handle, mpd);
+		if (err < 0) {
+			struct super_block *sb = inode->i_sb;
+
+			if (ext4_emergency_state(sb))
+				goto invalidate_dirty_pages;
+			/*
+			 * Let the upper layers retry transient errors.
+			 * In the case of ENOSPC, if ext4_count_free_clusters()
+			 * is non-zero, a commit should free up blocks.
+			 */
+			if ((err == -ENOMEM) || (err == -EAGAIN) ||
+			    (err == -ENOSPC && ext4_count_free_clusters(sb))) {
+				/*
+				 * We may have already allocated extents for
+				 * some bhs inside the folio, issue the
+				 * corresponding data to prevent stale data.
+				 */
+				if (progress) {
+					if (mpage_submit_partial_folio(mpd))
+						goto invalidate_dirty_pages;
+					goto update_disksize;
+				}
+				return err;
+			}
+			ext4_msg(sb, KERN_CRIT,
+				 "Delayed block allocation failed for "
+				 "inode %lu at logical offset %llu with"
+				 " max blocks %u with error %d",
+				 inode->i_ino,
+				 (unsigned long long)map->m_lblk,
+				 (unsigned)map->m_len, -err);
+			ext4_msg(sb, KERN_CRIT,
+				 "This should not happen!! Data will "
+				 "be lost\n");
+			if (err == -ENOSPC)
+				ext4_print_free_blocks(inode);
+		invalidate_dirty_pages:
+			*give_up_on_write = true;
+			return err;
+		}
+		progress = 1;
+		/*
+		 * Update buffer state, submit mapped pages, and get us a new
+		 * extent to map
+		 */
+		err = mpage_map_and_submit_buffers(mpd);
+		if (err < 0)
+			goto update_disksize;
+	} while (map->m_len);
+
+update_disksize:
+	/*
+	 * Update on-disk size after IO is submitted. Races with
+	 * truncate are avoided by checking i_size under i_data_sem.
+	 */
+	disksize = mpd->start_pos;
+	if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
+		int err2;
+		loff_t i_size;
+
+		down_write(&EXT4_I(inode)->i_data_sem);
+		i_size = i_size_read(inode);
+		if (disksize > i_size)
+			disksize = i_size;
+		if (disksize > EXT4_I(inode)->i_disksize)
+			EXT4_I(inode)->i_disksize = disksize;
+		up_write(&EXT4_I(inode)->i_data_sem);
+		err2 = ext4_mark_inode_dirty(handle, inode);
+		if (err2) {
+			ext4_error_err(inode->i_sb, -err2,
+				       "Failed to mark inode %lu dirty",
+				       inode->i_ino);
+		}
+		if (!err)
+			err = err2;
+	}
+	return err;
+}
+
+static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio,
+				      size_t len)
+{
+	struct buffer_head *page_bufs = folio_buffers(folio);
+	struct inode *inode = folio->mapping->host;
+	int ret, err;
+
+	ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
+				     NULL, do_journal_get_write_access);
+	err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
+				     NULL, write_end_fn);
+	if (ret == 0)
+		ret = err;
+	err = ext4_jbd2_inode_add_write(handle, inode, folio_pos(folio), len);
+	if (ret == 0)
+		ret = err;
+	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
+
+	return ret;
+}
+
+static int mpage_journal_page_buffers(handle_t *handle,
+				      struct mpage_da_data *mpd,
+				      struct folio *folio)
+{
+	struct inode *inode = mpd->inode;
+	loff_t size = i_size_read(inode);
+	size_t len = folio_size(folio);
+
+	folio_clear_checked(folio);
+	mpd->wbc->nr_to_write -= folio_nr_pages(folio);
+
+	if (folio_pos(folio) + len > size &&
+	    !ext4_verity_in_progress(inode))
+		len = size & (len - 1);
+
+	return ext4_journal_folio_buffers(handle, folio, len);
+}
+
+/*
+ * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
+ *				 needing mapping, submit mapped pages
+ *
+ * @mpd - where to look for pages
+ *
+ * Walk dirty pages in the mapping. If they are fully mapped, submit them for
+ * IO immediately. If we cannot map blocks, we submit just already mapped
+ * buffers in the page for IO and keep page dirty. When we can map blocks and
+ * we find a page which isn't mapped we start accumulating an extent of
+ * buffers underlying these pages that needs mapping (formed by either delayed
+ * or unwritten buffers). We also lock the pages containing these buffers. The
+ * extent found is returned in @mpd structure (starting at mpd->lblk with
+ * length mpd->len blocks).
+ *
+ * Note that this function can attach bios to one io_end structure which are
+ * neither logically nor physically contiguous. Although it may seem an
+ * unnecessary complication, it is actually inevitable in the blocksize <
+ * pagesize case as we need to track IO to all buffers underlying a page in
+ * one io_end.
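+ *
+ * In data=journal mode a handle is held across the scan so that folios
+ * which only need their buffers journalled, rather than mapped, can be
+ * processed right away via mpage_journal_page_buffers().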
+ */ +static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) +{ + struct address_space *mapping = mpd->inode->i_mapping; + struct folio_batch fbatch; + unsigned int nr_folios; + pgoff_t index = mpd->start_pos >> PAGE_SHIFT; + pgoff_t end = mpd->end_pos >> PAGE_SHIFT; + xa_mark_t tag; + int i, err = 0; + int blkbits = mpd->inode->i_blkbits; + ext4_lblk_t lblk; + struct buffer_head *head; + handle_t *handle = NULL; + int bpp = ext4_journal_blocks_per_folio(mpd->inode); + + if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; + + mpd->map.m_len = 0; + mpd->next_pos = mpd->start_pos; + if (ext4_should_journal_data(mpd->inode)) { + handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, + bpp); + if (IS_ERR(handle)) + return PTR_ERR(handle); + } + folio_batch_init(&fbatch); + while (index <= end) { + nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch); + if (nr_folios == 0) + break; + + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; + + /* + * Accumulated enough dirty pages? This doesn't apply + * to WB_SYNC_ALL mode. For integrity sync we have to + * keep going because someone may be concurrently + * dirtying pages, and we might have synced a lot of + * newly appeared dirty pages, but have not synced all + * of the old dirty pages. + */ + if (mpd->wbc->sync_mode == WB_SYNC_NONE && + mpd->wbc->nr_to_write <= + mpd->map.m_len >> (PAGE_SHIFT - blkbits)) + goto out; + + /* If we can't merge this page, we are done. */ + if (mpd->map.m_len > 0 && + mpd->next_pos != folio_pos(folio)) + goto out; + + if (handle) { + err = ext4_journal_ensure_credits(handle, bpp, + 0); + if (err < 0) + goto out; + } + + folio_lock(folio); + /* + * If the page is no longer dirty, or its mapping no + * longer corresponds to inode we are writing (which + * means it has been truncated or invalidated), or the + * page is already under writeback and we are not doing + * a data integrity writeback, skip the page + */ + if (!folio_test_dirty(folio) || + (folio_test_writeback(folio) && + (mpd->wbc->sync_mode == WB_SYNC_NONE)) || + unlikely(folio->mapping != mapping)) { + folio_unlock(folio); + continue; + } + + folio_wait_writeback(folio); + BUG_ON(folio_test_writeback(folio)); + + /* + * Should never happen but for buggy code in + * other subsystems that call + * set_page_dirty() without properly warning + * the file system first. See [1] for more + * information. + * + * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz + */ + if (!folio_buffers(folio)) { + ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index); + folio_clear_dirty(folio); + folio_unlock(folio); + continue; + } + + if (mpd->map.m_len == 0) + mpd->start_pos = folio_pos(folio); + mpd->next_pos = folio_pos(folio) + folio_size(folio); + /* + * Writeout when we cannot modify metadata is simple. + * Just submit the page. For data=journal mode we + * first handle writeout of the page for checkpoint and + * only after that handle delayed page dirtying. This + * makes sure current data is checkpointed to the final + * location before possibly journalling it again which + * is desirable when the page is frequently dirtied + * through a pin. + */ + if (!mpd->can_map) { + err = mpage_submit_folio(mpd, folio); + if (err < 0) + goto out; + /* Pending dirtying of journalled data? 
*/ + if (folio_test_checked(folio)) { + err = mpage_journal_page_buffers(handle, + mpd, folio); + if (err < 0) + goto out; + mpd->journalled_more_data = 1; + } + mpage_folio_done(mpd, folio); + } else { + /* Add all dirty buffers to mpd */ + lblk = ((ext4_lblk_t)folio->index) << + (PAGE_SHIFT - blkbits); + head = folio_buffers(folio); + err = mpage_process_page_bufs(mpd, head, head, + lblk); + if (err <= 0) + goto out; + err = 0; + } + } + folio_batch_release(&fbatch); + cond_resched(); + } + mpd->scanned_until_end = 1; + if (handle) + ext4_journal_stop(handle); + return 0; +out: + folio_batch_release(&fbatch); + if (handle) + ext4_journal_stop(handle); + return err; +} + +static int ext4_do_writepages(struct mpage_da_data *mpd) +{ + struct writeback_control *wbc = mpd->wbc; + pgoff_t writeback_index = 0; + long nr_to_write = wbc->nr_to_write; + int range_whole = 0; + int cycled = 1; + handle_t *handle = NULL; + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; + int needed_blocks, rsv_blocks = 0, ret = 0; + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + struct blk_plug plug; + bool give_up_on_write = false; + + trace_ext4_writepages(inode, wbc); + + /* + * No pages to write? This is mainly a kludge to avoid starting + * a transaction for special inodes like journal inode on last iput() + * because that could violate lock ordering on umount + */ + if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + goto out_writepages; + + /* + * If the filesystem has aborted, it is read-only, so return + * right away instead of dumping stack traces later on that + * will obscure the real source of the problem. We test + * fs shutdown state instead of sb->s_flag's SB_RDONLY because + * the latter could be true if the filesystem is mounted + * read-only, and in that case, ext4_writepages should + * *never* be called, so if that ever happens, we would want + * the stack trace. + */ + ret = ext4_emergency_state(mapping->host->i_sb); + if (unlikely(ret)) + goto out_writepages; + + /* + * If we have inline data and arrive here, it means that + * we will soon create the block for the 1st page, so + * we'd better clear the inline data here. + */ + if (ext4_has_inline_data(inode)) { + /* Just inode will be modified... */ + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_writepages; + } + BUG_ON(ext4_test_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA)); + ext4_destroy_inline_data(handle, inode); + ext4_journal_stop(handle); + } + + /* + * data=journal mode does not do delalloc so we just need to writeout / + * journal already mapped buffers. On the other hand we need to commit + * transaction to make data stable. We expect all the data to be + * already in the journal (the only exception are DMA pinned pages + * dirtied behind our back) so we commit transaction here and run the + * writeback loop to checkpoint them. The checkpointing is not actually + * necessary to make data persistent *but* quite a few places (extent + * shifting operations, fsverity, ...) depend on being able to drop + * pagecache pages after calling filemap_write_and_wait() and for that + * checkpointing needs to happen. 
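+	 *
+	 * Folios that still had journalled dirtying pending set
+	 * mpd->journalled_more_data during the scan, which makes
+	 * ext4_writepages() run one more pass over them (see below).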
+ */ + if (ext4_should_journal_data(inode)) { + mpd->can_map = 0; + if (wbc->sync_mode == WB_SYNC_ALL) + ext4_fc_commit(sbi->s_journal, + EXT4_I(inode)->i_datasync_tid); + } + mpd->journalled_more_data = 0; + + if (ext4_should_dioread_nolock(inode)) { + int bpf = ext4_journal_blocks_per_folio(inode); + /* + * We may need to convert up to one extent per block in + * the folio and we may dirty the inode. + */ + rsv_blocks = 1 + ext4_ext_index_trans_blocks(inode, bpf); + } + + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + + if (wbc->range_cyclic) { + writeback_index = mapping->writeback_index; + if (writeback_index) + cycled = 0; + mpd->start_pos = writeback_index << PAGE_SHIFT; + mpd->end_pos = LLONG_MAX; + } else { + mpd->start_pos = wbc->range_start; + mpd->end_pos = wbc->range_end; + } + + ext4_io_submit_init(&mpd->io_submit, wbc); +retry: + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, mpd->start_pos >> PAGE_SHIFT, + mpd->end_pos >> PAGE_SHIFT); + blk_start_plug(&plug); + + /* + * First writeback pages that don't need mapping - we can avoid + * starting a transaction unnecessarily and also avoid being blocked + * in the block layer on device congestion while having transaction + * started. + */ + mpd->do_map = 0; + mpd->scanned_until_end = 0; + mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!mpd->io_submit.io_end) { + ret = -ENOMEM; + goto unplug; + } + ret = mpage_prepare_extent_to_map(mpd); + /* Unlock pages we didn't use */ + mpage_release_unused_pages(mpd, false); + /* Submit prepared bio */ + ext4_io_submit(&mpd->io_submit); + ext4_put_io_end_defer(mpd->io_submit.io_end); + mpd->io_submit.io_end = NULL; + if (ret < 0) + goto unplug; + + while (!mpd->scanned_until_end && wbc->nr_to_write > 0) { + /* For each extent of pages we use new io_end */ + mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!mpd->io_submit.io_end) { + ret = -ENOMEM; + break; + } + + WARN_ON_ONCE(!mpd->can_map); + /* + * We have two constraints: We find one extent to map and we + * must always write out whole page (makes a difference when + * blocksize < pagesize) so that we don't block on IO when we + * try to write out the rest of the page. Journalled mode is + * not supported by delalloc. + */ + BUG_ON(ext4_should_journal_data(inode)); + /* + * Calculate the number of credits needed to reserve for one + * extent of up to MAX_WRITEPAGES_EXTENT_LEN blocks. It will + * attempt to extend the transaction or start a new iteration + * if the reserved credits are insufficient. 
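+		 * (With MAX_WRITEPAGES_EXTENT_LEN at 2048 blocks this is up
+		 * to 8 MiB per extent on a 4k-block file system.)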
+ */ + needed_blocks = ext4_chunk_trans_blocks(inode, + MAX_WRITEPAGES_EXTENT_LEN); + /* start a new transaction */ + handle = ext4_journal_start_with_reserve(inode, + EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " + "%ld pages, ino %lu; err %d", __func__, + wbc->nr_to_write, inode->i_ino, ret); + /* Release allocated io_end */ + ext4_put_io_end(mpd->io_submit.io_end); + mpd->io_submit.io_end = NULL; + break; + } + mpd->do_map = 1; + + trace_ext4_da_write_folios_start(inode, mpd->start_pos, + mpd->next_pos, wbc); + ret = mpage_prepare_extent_to_map(mpd); + if (!ret && mpd->map.m_len) + ret = mpage_map_and_submit_extent(handle, mpd, + &give_up_on_write); + /* + * Caution: If the handle is synchronous, + * ext4_journal_stop() can wait for transaction commit + * to finish which may depend on writeback of pages to + * complete or on page lock to be released. In that + * case, we have to wait until after we have + * submitted all the IO, released page locks we hold, + * and dropped io_end reference (for extent conversion + * to be able to complete) before stopping the handle. + */ + if (!ext4_handle_valid(handle) || handle->h_sync == 0) { + ext4_journal_stop(handle); + handle = NULL; + mpd->do_map = 0; + } + /* Unlock pages we didn't use */ + mpage_release_unused_pages(mpd, give_up_on_write); + /* Submit prepared bio */ + ext4_io_submit(&mpd->io_submit); + + /* + * Drop our io_end reference we got from init. We have + * to be careful and use deferred io_end finishing if + * we are still holding the transaction as we can + * release the last reference to io_end which may end + * up doing unwritten extent conversion. + */ + if (handle) { + ext4_put_io_end_defer(mpd->io_submit.io_end); + ext4_journal_stop(handle); + } else + ext4_put_io_end(mpd->io_submit.io_end); + mpd->io_submit.io_end = NULL; + trace_ext4_da_write_folios_end(inode, mpd->start_pos, + mpd->next_pos, wbc, ret); + + if (ret == -ENOSPC && sbi->s_journal) { + /* + * Commit the transaction which would + * free blocks released in the transaction + * and try again + */ + jbd2_journal_force_commit_nested(sbi->s_journal); + ret = 0; + continue; + } + if (ret == -EAGAIN) + ret = 0; + /* Fatal error - ENOMEM, EIO... */ + if (ret) + break; + } +unplug: + blk_finish_plug(&plug); + if (!ret && !cycled && wbc->nr_to_write > 0) { + cycled = 1; + mpd->end_pos = (writeback_index << PAGE_SHIFT) - 1; + mpd->start_pos = 0; + goto retry; + } + + /* Update index */ + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + /* + * Set the writeback_index so that range_cyclic + * mode will write it back later + */ + mapping->writeback_index = mpd->start_pos >> PAGE_SHIFT; + +out_writepages: + trace_ext4_writepages_result(inode, wbc, ret, + nr_to_write - wbc->nr_to_write); + return ret; +} + +static int ext4_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct super_block *sb = mapping->host->i_sb; + struct mpage_da_data mpd = { + .inode = mapping->host, + .wbc = wbc, + .can_map = 1, + }; + int ret; + int alloc_ctx; + + ret = ext4_emergency_state(sb); + if (unlikely(ret)) + return ret; + + alloc_ctx = ext4_writepages_down_read(sb); + ret = ext4_do_writepages(&mpd); + /* + * For data=journal writeback we could have come across pages marked + * for delayed dirtying (PageChecked) which were just added to the + * running transaction. Try once more to get them to stable storage. 
+ */
+	if (!ret && mpd.journalled_more_data)
+		ret = ext4_do_writepages(&mpd);
+	ext4_writepages_up_read(sb, alloc_ctx);
+
+	return ret;
+}
+
+int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode)
+{
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = LONG_MAX,
+		.range_start = jinode->i_dirty_start,
+		.range_end = jinode->i_dirty_end,
+	};
+	struct mpage_da_data mpd = {
+		.inode = jinode->i_vfs_inode,
+		.wbc = &wbc,
+		.can_map = 0,
+	};
+	return ext4_do_writepages(&mpd);
+}
+
+static int ext4_dax_writepages(struct address_space *mapping,
+			       struct writeback_control *wbc)
+{
+	int ret;
+	long nr_to_write = wbc->nr_to_write;
+	struct inode *inode = mapping->host;
+	int alloc_ctx;
+
+	ret = ext4_emergency_state(inode->i_sb);
+	if (unlikely(ret))
+		return ret;
+
+	alloc_ctx = ext4_writepages_down_read(inode->i_sb);
+	trace_ext4_writepages(inode, wbc);
+
+	ret = dax_writeback_mapping_range(mapping,
+					  EXT4_SB(inode->i_sb)->s_daxdev, wbc);
+	trace_ext4_writepages_result(inode, wbc, ret,
+				     nr_to_write - wbc->nr_to_write);
+	ext4_writepages_up_read(inode->i_sb, alloc_ctx);
+	return ret;
+}
+
+static int ext4_nonda_switch(struct super_block *sb)
+{
+	s64 free_clusters, dirty_clusters;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	/*
+	 * Switch to non-delalloc mode if we are running low on free
+	 * blocks. The free block accounting via percpu counters can
+	 * get slightly wrong with percpu_counter_batch getting
+	 * accumulated on each CPU without updating global counters.
+	 * Delalloc needs accurate free block accounting, so switch
+	 * to non-delalloc when we are near the error range.
+	 */
+	free_clusters =
+		percpu_counter_read_positive(&sbi->s_freeclusters_counter);
+	dirty_clusters =
+		percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
+	/*
+	 * Start pushing delalloc when 1/2 of free blocks are dirty.
+	 */
+	if (dirty_clusters && (free_clusters < 2 * dirty_clusters))
+		try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
+
+	if (2 * free_clusters < 3 * dirty_clusters ||
+	    free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) {
+		/*
+		 * The free block count is less than 150% of dirty blocks,
+		 * or free blocks are below the watermark.
+		 */
+		return 1;
+	}
+	return 0;
+}
+
+static int ext4_da_write_begin(const struct kiocb *iocb,
+			       struct address_space *mapping,
+			       loff_t pos, unsigned len,
+			       struct folio **foliop, void **fsdata)
+{
+	int ret, retries = 0;
+	struct folio *folio;
+	pgoff_t index;
+	struct inode *inode = mapping->host;
+
+	ret = ext4_emergency_state(inode->i_sb);
+	if (unlikely(ret))
+		return ret;
+
+	index = pos >> PAGE_SHIFT;
+
+	if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) {
+		*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
+		return ext4_write_begin(iocb, mapping, pos,
+					len, foliop, fsdata);
+	}
+	*fsdata = (void *)0;
+	trace_ext4_da_write_begin(inode, pos, len);
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+		ret = ext4_generic_write_inline_data(mapping, inode, pos, len,
+						     foliop, fsdata, true);
+		if (ret < 0)
+			return ret;
+		if (ret == 1)
+			return 0;
+	}
+
+retry:
+	folio = write_begin_get_folio(iocb, mapping, index, len);
+	if (IS_ERR(folio))
+		return PTR_ERR(folio);
+
+	if (pos + len > folio_pos(folio) + folio_size(folio))
+		len = folio_pos(folio) + folio_size(folio) - pos;
+
+	ret = ext4_block_write_begin(NULL, folio, pos, len,
+				     ext4_da_get_block_prep);
+	if (ret < 0) {
+		folio_unlock(folio);
+		folio_put(folio);
+		/*
+		 * ext4_block_write_begin may have instantiated a few blocks
+		 * outside i_size. Trim these off again. Don't need
+		 * i_size_read because we hold inode lock.
+		 */
+		if (pos + len > inode->i_size)
+			ext4_truncate_failed_write(inode);
+
+		if (ret == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry;
+		return ret;
+	}
+
+	*foliop = folio;
+	return ret;
+}
+
+/*
+ * Check if we should update i_disksize when the write goes to the end of
+ * the file but does not require block allocation.
+ */
+static int ext4_da_should_update_i_disksize(struct folio *folio,
+					    unsigned long offset)
+{
+	struct buffer_head *bh;
+	struct inode *inode = folio->mapping->host;
+	unsigned int idx;
+	int i;
+
+	bh = folio_buffers(folio);
+	idx = offset >> inode->i_blkbits;
+
+	for (i = 0; i < idx; i++)
+		bh = bh->b_this_page;
+
+	if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
+		return 0;
+	return 1;
+}
+
+static int ext4_da_do_write_end(struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct folio *folio)
+{
+	struct inode *inode = mapping->host;
+	loff_t old_size = inode->i_size;
+	bool disksize_changed = false;
+	loff_t new_i_size, zero_len = 0;
+	handle_t *handle;
+
+	if (unlikely(!folio_buffers(folio))) {
+		folio_unlock(folio);
+		folio_put(folio);
+		return -EIO;
+	}
+	/*
+	 * block_write_end() will mark the inode as dirty with the
+	 * I_DIRTY_PAGES flag, which is all that's needed to trigger page
+	 * writeback.
+	 */
+	copied = block_write_end(pos, len, copied, folio);
+	new_i_size = pos + copied;
+
+	/*
+	 * It's important to update i_size while still holding folio lock,
+	 * because folio writeout could otherwise come in and zero beyond
+	 * i_size.
+	 *
+	 * Since we are holding inode lock, we are sure i_disksize <=
+	 * i_size. We also know that if i_disksize < i_size, there are
+	 * delalloc writes pending in the range up to i_size. If the end of
+	 * the current write is <= i_size, there's no need to touch
+	 * i_disksize since writeback will push i_disksize up to i_size
+	 * eventually. If the end of the current write is > i_size and
+	 * inside an allocated block, which ext4_da_should_update_i_disksize()
+	 * checked, we need to update i_disksize here, because the
+	 * ext4_writepages() paths that do not allocate blocks do not update
+	 * i_disksize either.
+	 */
+	if (new_i_size > inode->i_size) {
+		unsigned long end;
+
+		i_size_write(inode, new_i_size);
+		end = offset_in_folio(folio, new_i_size - 1);
+		if (copied && ext4_da_should_update_i_disksize(folio, end)) {
+			ext4_update_i_disksize(inode, new_i_size);
+			disksize_changed = true;
+		}
+	}
+
+	folio_unlock(folio);
+	folio_put(folio);
+
+	if (pos > old_size) {
+		pagecache_isize_extended(inode, old_size, pos);
+		zero_len = pos - old_size;
+	}
+
+	if (!disksize_changed && !zero_len)
+		return copied;
+
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	if (zero_len)
+		ext4_zero_partial_blocks(handle, inode, old_size, zero_len);
+	ext4_mark_inode_dirty(handle, inode);
+	ext4_journal_stop(handle);
+
+	return copied;
+}
+
+static int ext4_da_write_end(const struct kiocb *iocb,
+			     struct address_space *mapping,
+			     loff_t pos, unsigned len, unsigned copied,
+			     struct folio *folio, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	int write_mode = (int)(unsigned long)fsdata;
+
+	if (write_mode == FALL_BACK_TO_NONDELALLOC)
+		return ext4_write_end(iocb, mapping, pos,
+				      len, copied, folio, fsdata);
+
+	trace_ext4_da_write_end(inode, pos, len, copied);
+
+	if (write_mode != CONVERT_INLINE_DATA &&
+	    ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
+	    ext4_has_inline_data(inode))
+		return ext4_write_inline_data_end(inode, pos, len, copied,
+						  folio);
+
+	if (unlikely(copied < len) && !folio_test_uptodate(folio))
+		copied = 0;
+
+	return ext4_da_do_write_end(mapping, pos, len, copied, folio);
+}
+
+/*
+ * Force all delayed allocation blocks to be allocated for a given inode.
+ */
+int ext4_alloc_da_blocks(struct inode *inode)
+{
+	trace_ext4_alloc_da_blocks(inode);
+
+	if (!EXT4_I(inode)->i_reserved_data_blocks)
+		return 0;
+
+	/*
+	 * We do something simple for now. The filemap_flush() will
+	 * also start triggering a write of the data blocks, which is
+	 * not strictly speaking necessary (and for users of
+	 * laptop_mode, not even desirable). However, to do otherwise
+	 * would require replicating code paths in:
+	 *
+	 * ext4_writepages() ->
+	 *    write_cache_pages() ---> (via passed in callback function)
+	 *        __mpage_da_writepage() -->
+	 *           mpage_add_bh_to_extent()
+	 *           mpage_da_map_blocks()
+	 *
+	 * The problem is that write_cache_pages(), located in
+	 * mm/page-writeback.c, marks pages clean in preparation for
+	 * doing I/O, which is not desirable if we're not planning on
+	 * doing I/O at all.
+	 *
+	 * We could call write_cache_pages(), and then redirty all of
+	 * the pages by calling redirty_page_for_writepage() but that
+	 * would be ugly in the extreme. So instead we would need to
+	 * replicate parts of the code in the above functions,
+	 * simplifying them because we wouldn't actually intend to
+	 * write out the pages, but rather only collect contiguous
+	 * logical block extents, call the multi-block allocator, and
+	 * then update the buffer heads with the block allocations.
+	 *
+	 * For now, though, we'll cheat by calling filemap_flush(),
+	 * which will map the blocks, and start the I/O, but not
+	 * actually wait for the I/O to complete.
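+	 *
+	 * filemap_flush() goes through ext4_writepages() above, which is
+	 * what converts the outstanding delalloc reservations into real
+	 * block allocations.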
+ */ + return filemap_flush(inode->i_mapping); +} + +/* + * bmap() is special. It gets used by applications such as lilo and by + * the swapper to find the on-disk block of a specific piece of data. + * + * Naturally, this is dangerous if the block concerned is still in the + * journal. If somebody makes a swapfile on an ext4 data-journaling + * filesystem and enables swap, then they may get a nasty shock when the + * data getting swapped to that swapfile suddenly gets overwritten by + * the original zeros written out previously to the journal and + * awaiting writeback in the kernel's buffer cache. + * + * So, if we see any bmap calls here on a modified, data-journaled file, + * take extra steps to flush any blocks which might be in the cache. + */ +static sector_t ext4_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + sector_t ret = 0; + + inode_lock_shared(inode); + /* + * We can get here for an inline file via the FIBMAP ioctl + */ + if (ext4_has_inline_data(inode)) + goto out; + + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && + (test_opt(inode->i_sb, DELALLOC) || + ext4_should_journal_data(inode))) { + /* + * With delalloc or journalled data we want to sync the file so + * that we can make sure blocks are allocated for the file and + * the data is in place for the user to see it + */ + filemap_write_and_wait(mapping); + } + + ret = iomap_bmap(mapping, block, &ext4_iomap_ops); + +out: + inode_unlock_shared(inode); + return ret; +} + +static int ext4_read_folio(struct file *file, struct folio *folio) +{ + int ret = -EAGAIN; + struct inode *inode = folio->mapping->host; + + trace_ext4_read_folio(inode, folio); + + if (ext4_has_inline_data(inode)) + ret = ext4_readpage_inline(inode, folio); + + if (ret == -EAGAIN) + return ext4_mpage_readpages(inode, NULL, folio); + + return ret; +} + +static void ext4_readahead(struct readahead_control *rac) +{ + struct inode *inode = rac->mapping->host; + + /* If the file has inline data, no need to do readahead. */ + if (ext4_has_inline_data(inode)) + return; + + ext4_mpage_readpages(inode, rac, NULL); +} + +static void ext4_invalidate_folio(struct folio *folio, size_t offset, + size_t length) +{ + trace_ext4_invalidate_folio(folio, offset, length); + + /* No journalling happens on data buffers when this function is used */ + WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio))); + + block_invalidate_folio(folio, offset, length); +} + +static int __ext4_journalled_invalidate_folio(struct folio *folio, + size_t offset, size_t length) +{ + journal_t *journal = EXT4_JOURNAL(folio->mapping->host); + + trace_ext4_journalled_invalidate_folio(folio, offset, length); + + /* + * If it's a full truncate we just forget about the pending dirtying + */ + if (offset == 0 && length == folio_size(folio)) + folio_clear_checked(folio); + + return jbd2_journal_invalidate_folio(journal, folio, offset, length); +} + +/* Wrapper for aops...
*/ +static void ext4_journalled_invalidate_folio(struct folio *folio, + size_t offset, + size_t length) +{ + WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0); +} + +static bool ext4_release_folio(struct folio *folio, gfp_t wait) +{ + struct inode *inode = folio->mapping->host; + journal_t *journal = EXT4_JOURNAL(inode); + + trace_ext4_release_folio(inode, folio); + + /* Page has dirty journalled data -> cannot release */ + if (folio_test_checked(folio)) + return false; + if (journal) + return jbd2_journal_try_to_free_buffers(journal, folio); + else + return try_to_free_buffers(folio); +} + +static bool ext4_inode_datasync_dirty(struct inode *inode) +{ + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + + if (journal) { + if (jbd2_transaction_committed(journal, + EXT4_I(inode)->i_datasync_tid)) + return false; + if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) + return !list_empty(&EXT4_I(inode)->i_fc_list); + return true; + } + + /* Any metadata buffers to write? */ + if (!list_empty(&inode->i_mapping->i_private_list)) + return true; + return inode->i_state & I_DIRTY_DATASYNC; +} + +static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, + struct ext4_map_blocks *map, loff_t offset, + loff_t length, unsigned int flags) +{ + u8 blkbits = inode->i_blkbits; + + /* + * Writes that span EOF might trigger an I/O size update on completion, + * so consider them to be dirty for the purpose of O_DSYNC, even if + * there is no other metadata changes being made or are pending. + */ + iomap->flags = 0; + if (ext4_inode_datasync_dirty(inode) || + offset + length > i_size_read(inode)) + iomap->flags |= IOMAP_F_DIRTY; + + if (map->m_flags & EXT4_MAP_NEW) + iomap->flags |= IOMAP_F_NEW; + + /* HW-offload atomics are always used */ + if (flags & IOMAP_ATOMIC) + iomap->flags |= IOMAP_F_ATOMIC_BIO; + + if (flags & IOMAP_DAX) + iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; + else + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = (u64) map->m_lblk << blkbits; + iomap->length = (u64) map->m_len << blkbits; + + if ((map->m_flags & EXT4_MAP_MAPPED) && + !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + iomap->flags |= IOMAP_F_MERGED; + + /* + * Flags passed to ext4_map_blocks() for direct I/O writes can result + * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits + * set. In order for any allocated unwritten extents to be converted + * into written extents correctly within the ->end_io() handler, we + * need to ensure that the iomap->type is set appropriately. Hence, the + * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has + * been set first. 
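+ * + * In summary, the mapping below is: + * UNWRITTEN (possibly with MAPPED also set) -> IOMAP_UNWRITTEN + * MAPPED -> IOMAP_MAPPED + * DELAYED -> IOMAP_DELALLOC + * none of the above -> IOMAP_HOLE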
+ */ + if (map->m_flags & EXT4_MAP_UNWRITTEN) { + iomap->type = IOMAP_UNWRITTEN; + iomap->addr = (u64) map->m_pblk << blkbits; + if (flags & IOMAP_DAX) + iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; + } else if (map->m_flags & EXT4_MAP_MAPPED) { + iomap->type = IOMAP_MAPPED; + iomap->addr = (u64) map->m_pblk << blkbits; + if (flags & IOMAP_DAX) + iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; + } else if (map->m_flags & EXT4_MAP_DELAYED) { + iomap->type = IOMAP_DELALLOC; + iomap->addr = IOMAP_NULL_ADDR; + } else { + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + } +} + +static int ext4_map_blocks_atomic_write_slow(handle_t *handle, + struct inode *inode, struct ext4_map_blocks *map) +{ + ext4_lblk_t m_lblk = map->m_lblk; + unsigned int m_len = map->m_len; + unsigned int mapped_len = 0, m_flags = 0; + ext4_fsblk_t next_pblk; + bool check_next_pblk = false; + int ret = 0; + + WARN_ON_ONCE(!ext4_has_feature_bigalloc(inode->i_sb)); + + /* + * This is a slow path in case of mixed mapping. We use the + * EXT4_GET_BLOCKS_CREATE_ZERO flag here to make sure we get a single + * contiguous mapping. This will ensure any unwritten or hole + * regions within the requested range are zeroed out and we return + * a single contiguous mapped extent. + */ + m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; + + do { + ret = ext4_map_blocks(handle, inode, map, m_flags); + if (ret < 0 && ret != -ENOSPC) + goto out_err; + /* + * This should never happen, but let's return an error code to + * avoid an infinite loop in here. + */ + if (ret == 0) { + ret = -EFSCORRUPTED; + ext4_warning_inode(inode, + "ext4_map_blocks() couldn't allocate blocks m_flags: 0x%x, ret:%d", + m_flags, ret); + goto out_err; + } + /* + * With bigalloc we should never get ENOSPC nor discontiguous + * physical extents. + */ + if ((check_next_pblk && next_pblk != map->m_pblk) || + ret == -ENOSPC) { + ext4_warning_inode(inode, + "Non-contiguous allocation detected: expected %llu, got %llu, " + "or ext4_map_blocks() returned out of space ret: %d", + next_pblk, map->m_pblk, ret); + ret = -EFSCORRUPTED; + goto out_err; + } + next_pblk = map->m_pblk + map->m_len; + check_next_pblk = true; + + mapped_len += map->m_len; + map->m_lblk += map->m_len; + map->m_len = m_len - mapped_len; + } while (mapped_len < m_len); + + /* + * We might have done some work in the above loop, so we need to query + * the start of the physical extent, based on the original m_lblk and + * m_len. Let's also ensure we were able to allocate the required range + * for the mixed mapping case. + */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + + ret = ext4_map_blocks(handle, inode, map, + EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF); + if (ret != m_len) { + ext4_warning_inode(inode, + "allocation failed for atomic write request m_lblk:%u, m_len:%u, ret:%d\n", + m_lblk, m_len, ret); + ret = -EINVAL; + } + return ret; + +out_err: + /* reset map before returning an error */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + return ret; +} + +/* + * ext4_map_blocks_atomic_write: Helper routine to ensure the entire requested + * range in @map [lblk, lblk + len) is one single contiguous extent with no + * mixed mappings. + * + * We first use the m_flags passed to us by our caller (ext4_iomap_alloc()). + * We only use EXT4_GET_BLOCKS_CREATE_ZERO in the slow path, when the + * underlying physical extent for the requested range does not have a single + * contiguous mapping type i.e. (Hole, Mapped, or Unwritten) throughout.
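+ * (For example, part of the requested range may already be mapped while + * the rest is a hole or an unwritten extent inside the same allocated + * cluster.)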
+ * In that case we will loop over the requested range to allocate and zero out + * the unwritten / holes in between, to get a single mapped extent from + * [m_lblk, m_lblk + m_len). Note that this is only possible because we know + * this can be called only with bigalloc enabled filesystem where the underlying + * cluster is already allocated. This avoids allocating discontiguous extents + * in the slow path due to multiple calls to ext4_map_blocks(). + * The slow path is mostly non-performance critical path, so it should be ok to + * loop using ext4_map_blocks() with appropriate flags to allocate & zero the + * underlying short holes/unwritten extents within the requested range. + */ +static int ext4_map_blocks_atomic_write(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int m_flags, + bool *force_commit) +{ + ext4_lblk_t m_lblk = map->m_lblk; + unsigned int m_len = map->m_len; + int ret = 0; + + WARN_ON_ONCE(m_len > 1 && !ext4_has_feature_bigalloc(inode->i_sb)); + + ret = ext4_map_blocks(handle, inode, map, m_flags); + if (ret < 0 || ret == m_len) + goto out; + /* + * This is a mixed mapping case where we were not able to allocate + * a single contiguous extent. In that case let's reset requested + * mapping and call the slow path. + */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + + /* + * slow path means we have mixed mapping, that means we will need + * to force txn commit. + */ + *force_commit = true; + return ext4_map_blocks_atomic_write_slow(handle, inode, map); +out: + return ret; +} + +static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, + unsigned int flags) +{ + handle_t *handle; + u8 blkbits = inode->i_blkbits; + int ret, dio_credits, m_flags = 0, retries = 0; + bool force_commit = false; + + /* + * Trim the mapping request to the maximum value that we can map at + * once for direct I/O. + */ + if (map->m_len > DIO_MAX_BLOCKS) + map->m_len = DIO_MAX_BLOCKS; + + /* + * journal credits estimation for atomic writes. We call + * ext4_map_blocks(), to find if there could be a mixed mapping. If yes, + * then let's assume the no. of pextents required can be m_len i.e. + * every alternate block can be unwritten and hole. + */ + if (flags & IOMAP_ATOMIC) { + unsigned int orig_mlen = map->m_len; + + ret = ext4_map_blocks(NULL, inode, map, 0); + if (ret < 0) + return ret; + if (map->m_len < orig_mlen) { + map->m_len = orig_mlen; + dio_credits = ext4_meta_trans_blocks(inode, orig_mlen, + map->m_len); + } else { + dio_credits = ext4_chunk_trans_blocks(inode, + map->m_len); + } + } else { + dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); + } + +retry: + /* + * Either we allocate blocks and then don't get an unwritten extent, so + * in that case we have reserved enough credits. Or, the blocks are + * already allocated and unwritten. In that case, the extent conversion + * fits into the credits as well. + */ + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + /* + * DAX and direct I/O are the only two operations that are currently + * supported with IOMAP_WRITE. + */ + WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT))); + if (flags & IOMAP_DAX) + m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; + /* + * We use i_size instead of i_disksize here because delalloc writeback + * can complete at any point during the I/O and subsequently push the + * i_disksize out to i_size. 
This could be beyond where direct I/O is + * happening and thus expose allocated blocks to direct I/O reads. + */ + else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode)) + m_flags = EXT4_GET_BLOCKS_CREATE; + else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; + + if (flags & IOMAP_ATOMIC) + ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags, + &force_commit); + else + ret = ext4_map_blocks(handle, inode, map, m_flags); + + /* + * We cannot fill holes in indirect tree based inodes as that could + * expose stale data in the case of a crash. Use the magic error code + * to fallback to buffered I/O. + */ + if (!m_flags && !ret) + ret = -ENOTBLK; + + ext4_journal_stop(handle); + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + /* + * Force commit the current transaction if the allocation spans a mixed + * mapping range. This ensures any pending metadata updates (like + * unwritten to written extents conversion) in this range are in + * consistent state with the file data blocks, before performing the + * actual write I/O. If the commit fails, the whole I/O must be aborted + * to prevent any possible torn writes. + */ + if (ret > 0 && force_commit) { + int ret2; + + ret2 = ext4_force_commit(inode->i_sb); + if (ret2) + return ret2; + } + + return ret; +} + + +static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned flags, struct iomap *iomap, struct iomap *srcmap) +{ + int ret; + struct ext4_map_blocks map; + u8 blkbits = inode->i_blkbits; + unsigned int orig_mlen; + + if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) + return -EINVAL; + + if (WARN_ON_ONCE(ext4_has_inline_data(inode))) + return -ERANGE; + + /* + * Calculate the first and last logical blocks respectively. + */ + map.m_lblk = offset >> blkbits; + map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, + EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; + orig_mlen = map.m_len; + + if (flags & IOMAP_WRITE) { + /* + * We check here if the blocks are already allocated, then we + * don't need to start a journal txn and we can directly return + * the mapping information. This could boost performance + * especially in multi-threaded overwrite requests. + */ + if (offset + length <= i_size_read(inode)) { + ret = ext4_map_blocks(NULL, inode, &map, 0); + /* + * For atomic writes the entire requested length should + * be mapped. + */ + if (map.m_flags & EXT4_MAP_MAPPED) { + if ((!(flags & IOMAP_ATOMIC) && ret > 0) || + (flags & IOMAP_ATOMIC && ret >= orig_mlen)) + goto out; + } + map.m_len = orig_mlen; + } + ret = ext4_iomap_alloc(inode, &map, flags); + } else { + /* + * This can be called for overwrites path from + * ext4_iomap_overwrite_begin(). + */ + ret = ext4_map_blocks(NULL, inode, &map, 0); + } + + if (ret < 0) + return ret; +out: + /* + * When inline encryption is enabled, sometimes I/O to an encrypted file + * has to be broken up to guarantee DUN contiguity. Handle this by + * limiting the length of the mapping returned. + */ + map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); + + /* + * Before returning to iomap, let's ensure the allocated mapping + * covers the entire requested length for atomic writes. 
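+ * A short mapping here could allow the write to be split into multiple + * bios, breaking the all-or-nothing guarantee, so treat it as a hard + * error.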
+ */ + if (flags & IOMAP_ATOMIC) { + if (map.m_len < (length >> blkbits)) { + WARN_ON_ONCE(1); + return -EINVAL; + } + } + ext4_set_iomap(inode, iomap, &map, offset, length, flags); + + return 0; +} + +static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned flags, struct iomap *iomap, + struct iomap *srcmap) +{ + int ret; + + /* + * Even for writes we don't need to allocate blocks, so just pretend + * we are reading to save overhead of starting a transaction. + */ + flags &= ~IOMAP_WRITE; + ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); + WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED); + return ret; +} + +const struct iomap_ops ext4_iomap_ops = { + .iomap_begin = ext4_iomap_begin, +}; + +const struct iomap_ops ext4_iomap_overwrite_ops = { + .iomap_begin = ext4_iomap_overwrite_begin, +}; + +static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, + loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) +{ + int ret; + struct ext4_map_blocks map; + u8 blkbits = inode->i_blkbits; + + if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) + return -EINVAL; + + if (ext4_has_inline_data(inode)) { + ret = ext4_inline_data_iomap(inode, iomap); + if (ret != -EAGAIN) { + if (ret == 0 && offset >= iomap->length) + ret = -ENOENT; + return ret; + } + } + + /* + * Calculate the first and last logical block respectively. + */ + map.m_lblk = offset >> blkbits; + map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, + EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; + + /* + * Fiemap callers may call for offset beyond s_bitmap_maxbytes. + * So handle it here itself instead of querying ext4_map_blocks(). + * Since ext4_map_blocks() will warn about it and will return + * -EIO error. + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (offset >= sbi->s_bitmap_maxbytes) { + map.m_flags = 0; + goto set_iomap; + } + } + + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; +set_iomap: + ext4_set_iomap(inode, iomap, &map, offset, length, flags); + + return 0; +} + +const struct iomap_ops ext4_iomap_report_ops = { + .iomap_begin = ext4_iomap_begin_report, +}; + +/* + * For data=journal mode, folio should be marked dirty only when it was + * writeably mapped. When that happens, it was already attached to the + * transaction and marked as jbddirty (we take care of this in + * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings + * so we should have nothing to do here, except for the case when someone + * had the page pinned and dirtied the page through this pin (e.g. by doing + * direct IO to it). In that case we'd need to attach buffers here to the + * transaction but we cannot due to lock ordering. We cannot just dirty the + * folio and leave attached buffers clean, because the buffers' dirty state is + * "definitive". We cannot just set the buffers dirty or jbddirty because all + * the journalling code will explode. So what we do is to mark the folio + * "pending dirty" and next time ext4_writepages() is called, attach buffers + * to the transaction appropriately. 
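+ * That is why a pinned folio is marked "checked" below instead of being + * attached to the journal right away.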
+ */ +static bool ext4_journalled_dirty_folio(struct address_space *mapping, + struct folio *folio) +{ + WARN_ON_ONCE(!folio_buffers(folio)); + if (folio_maybe_dma_pinned(folio)) + folio_set_checked(folio); + return filemap_dirty_folio(mapping, folio); +} + +static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio) +{ + WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio)); + WARN_ON_ONCE(!folio_buffers(folio)); + return block_dirty_folio(mapping, folio); +} + +static int ext4_iomap_swap_activate(struct swap_info_struct *sis, + struct file *file, sector_t *span) +{ + return iomap_swapfile_activate(sis, file, span, + &ext4_iomap_report_ops); +} + +static const struct address_space_operations ext4_aops = { + .read_folio = ext4_read_folio, + .readahead = ext4_readahead, + .writepages = ext4_writepages, + .write_begin = ext4_write_begin, + .write_end = ext4_write_end, + .dirty_folio = ext4_dirty_folio, + .bmap = ext4_bmap, + .invalidate_folio = ext4_invalidate_folio, + .release_folio = ext4_release_folio, + .migrate_folio = buffer_migrate_folio, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_folio = generic_error_remove_folio, + .swap_activate = ext4_iomap_swap_activate, +}; + +static const struct address_space_operations ext4_journalled_aops = { + .read_folio = ext4_read_folio, + .readahead = ext4_readahead, + .writepages = ext4_writepages, + .write_begin = ext4_write_begin, + .write_end = ext4_journalled_write_end, + .dirty_folio = ext4_journalled_dirty_folio, + .bmap = ext4_bmap, + .invalidate_folio = ext4_journalled_invalidate_folio, + .release_folio = ext4_release_folio, + .migrate_folio = buffer_migrate_folio_norefs, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_folio = generic_error_remove_folio, + .swap_activate = ext4_iomap_swap_activate, +}; + +static const struct address_space_operations ext4_da_aops = { + .read_folio = ext4_read_folio, + .readahead = ext4_readahead, + .writepages = ext4_writepages, + .write_begin = ext4_da_write_begin, + .write_end = ext4_da_write_end, + .dirty_folio = ext4_dirty_folio, + .bmap = ext4_bmap, + .invalidate_folio = ext4_invalidate_folio, + .release_folio = ext4_release_folio, + .migrate_folio = buffer_migrate_folio, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_folio = generic_error_remove_folio, + .swap_activate = ext4_iomap_swap_activate, +}; + +static const struct address_space_operations ext4_dax_aops = { + .writepages = ext4_dax_writepages, + .dirty_folio = noop_dirty_folio, + .bmap = ext4_bmap, + .swap_activate = ext4_iomap_swap_activate, +}; + +void ext4_set_aops(struct inode *inode) +{ + switch (ext4_inode_journal_mode(inode)) { + case EXT4_INODE_ORDERED_DATA_MODE: + case EXT4_INODE_WRITEBACK_DATA_MODE: + break; + case EXT4_INODE_JOURNAL_DATA_MODE: + inode->i_mapping->a_ops = &ext4_journalled_aops; + return; + default: + BUG(); + } + if (IS_DAX(inode)) + inode->i_mapping->a_ops = &ext4_dax_aops; + else if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_aops; +} + +/* + * Here we can't skip an unwritten buffer even though it usually reads zero + * because it might have data in pagecache (eg, if called from ext4_zero_range, + * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a + * racing writeback can come later and flush the stale pagecache to disk. 
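+ * Hence the buffer is read in (and decrypted when fscrypt is in use) + * before the range is zeroed.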
+ */ +static int __ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length) +{ + unsigned int offset, blocksize, pos; + ext4_lblk_t iblock; + struct inode *inode = mapping->host; + struct buffer_head *bh; + struct folio *folio; + int err = 0; + + folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_constraint(mapping, ~__GFP_FS)); + if (IS_ERR(folio)) + return PTR_ERR(folio); + + blocksize = inode->i_sb->s_blocksize; + + iblock = folio->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); + + bh = folio_buffers(folio); + if (!bh) + bh = create_empty_buffers(folio, blocksize, 0); + + /* Find the buffer that contains "offset" */ + offset = offset_in_folio(folio, from); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + if (buffer_freed(bh)) { + BUFFER_TRACE(bh, "freed: skip"); + goto unlock; + } + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "unmapped"); + ext4_get_block(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "still unmapped"); + goto unlock; + } + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (folio_test_uptodate(folio)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + err = ext4_read_bh_lock(bh, 0, true); + if (err) + goto unlock; + if (fscrypt_inode_uses_fs_layer_crypto(inode)) { + /* We expect the key to be set. */ + BUG_ON(!fscrypt_has_encryption_key(inode)); + err = fscrypt_decrypt_pagecache_blocks(folio, + blocksize, + bh_offset(bh)); + if (err) { + clear_buffer_uptodate(bh); + goto unlock; + } + } + } + if (ext4_should_journal_data(inode)) { + BUFFER_TRACE(bh, "get write access"); + err = ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); + if (err) + goto unlock; + } + folio_zero_range(folio, offset, length); + BUFFER_TRACE(bh, "zeroed end of block"); + + if (ext4_should_journal_data(inode)) { + err = ext4_dirty_journalled_data(handle, bh); + } else { + err = 0; + mark_buffer_dirty(bh); + if (ext4_should_order_data(inode)) + err = ext4_jbd2_inode_add_write(handle, inode, from, + length); + } + +unlock: + folio_unlock(folio); + folio_put(folio); + return err; +} + +/* + * ext4_block_zero_page_range() zeros out a mapping of length 'length' + * starting from file offset 'from'. The range to be zero'd must + * be contained with in one block. If the specified range exceeds + * the end of the block it will be shortened to end of the block + * that corresponds to 'from' + */ +static int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length) +{ + struct inode *inode = mapping->host; + unsigned offset = from & (PAGE_SIZE-1); + unsigned blocksize = inode->i_sb->s_blocksize; + unsigned max = blocksize - (offset & (blocksize - 1)); + + /* + * correct length if it does not fall between + * 'from' and the end of the block + */ + if (length > max || length < 0) + length = max; + + if (IS_DAX(inode)) { + return dax_zero_range(inode, from, length, NULL, + &ext4_iomap_ops); + } + return __ext4_block_zero_page_range(handle, mapping, from, length); +} + +/* + * ext4_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. 
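+ * + * For example, truncating a file to 100 bytes with a 4096-byte block + * size zeroes bytes 100..4095 of the block containing the new EOF.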
+ */ +static int ext4_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from) +{ + unsigned offset = from & (PAGE_SIZE-1); + unsigned length; + unsigned blocksize; + struct inode *inode = mapping->host; + + /* If we are processing an encrypted inode during orphan list handling */ + if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode)) + return 0; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + + return ext4_block_zero_page_range(handle, mapping, from, length); +} + +int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, + loff_t lstart, loff_t length) +{ + struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; + unsigned partial_start, partial_end; + ext4_fsblk_t start, end; + loff_t byte_end = (lstart + length - 1); + int err = 0; + + partial_start = lstart & (sb->s_blocksize - 1); + partial_end = byte_end & (sb->s_blocksize - 1); + + start = lstart >> sb->s_blocksize_bits; + end = byte_end >> sb->s_blocksize_bits; + + /* Handle partial zero within the single block */ + if (start == end && + (partial_start || (partial_end != sb->s_blocksize - 1))) { + err = ext4_block_zero_page_range(handle, mapping, + lstart, length); + return err; + } + /* Handle partial zero out on the start of the range */ + if (partial_start) { + err = ext4_block_zero_page_range(handle, mapping, + lstart, sb->s_blocksize); + if (err) + return err; + } + /* Handle partial zero out on the end of the range */ + if (partial_end != sb->s_blocksize - 1) + err = ext4_block_zero_page_range(handle, mapping, + byte_end - partial_end, + partial_end + 1); + return err; +} + +int ext4_can_truncate(struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) + return 1; + if (S_ISDIR(inode->i_mode)) + return 1; + if (S_ISLNK(inode->i_mode)) + return !ext4_inode_is_fast_symlink(inode); + return 0; +} + +/* + * We have to make sure i_disksize gets properly updated before we truncate + * page cache due to hole punching or zero range. Otherwise i_disksize update + * can get lost as it may have been postponed to submission of writeback but + * that will never happen if we remove the folio containing i_size from the + * page cache. Also if we punch hole within i_size but above i_disksize, + * following ext4_page_mkwrite() may mistakenly allocate written blocks over + * the hole and thus introduce allocated blocks beyond i_disksize which is + * not allowed (e2fsck would complain in case of crash). + */ +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len) +{ + handle_t *handle; + int ret; + + loff_t size = i_size_read(inode); + + WARN_ON(!inode_is_locked(inode)); + if (offset > size) + return 0; + + if (offset + len < size) + size = offset + len; + if (EXT4_I(inode)->i_disksize >= size) + return 0; + + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_update_i_disksize(inode, size); + ret = ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + + return ret; +} + +static inline void ext4_truncate_folio(struct inode *inode, + loff_t start, loff_t end) +{ + unsigned long blocksize = i_blocksize(inode); + struct folio *folio; + + /* Nothing to be done if no complete block needs to be truncated. 
*/ + if (round_up(start, blocksize) >= round_down(end, blocksize)) + return; + + folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + + if (folio_mkclean(folio)) + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); +} + +int ext4_truncate_page_cache_block_range(struct inode *inode, + loff_t start, loff_t end) +{ + unsigned long blocksize = i_blocksize(inode); + int ret; + + /* + * For journalled data we need to write (and checkpoint) pages + * before discarding page cache to avoid inconsistent data on disk + * in case of a crash before the freeing or unwritten-conversion + * transaction is committed. + */ + if (ext4_should_journal_data(inode)) { + ret = filemap_write_and_wait_range(inode->i_mapping, start, + end - 1); + if (ret) + return ret; + goto truncate_pagecache; + } + + /* + * If the block size is less than the page size, the file's mapped + * blocks within one page could be freed or converted to unwritten. + * So it's necessary to remove writable userspace mappings, and then + * ext4_page_mkwrite() can be called during subsequent write access + * to these partial folios. + */ + if (!IS_ALIGNED(start | end, PAGE_SIZE) && + blocksize < PAGE_SIZE && start < inode->i_size) { + loff_t page_boundary = round_up(start, PAGE_SIZE); + + ext4_truncate_folio(inode, start, min(page_boundary, end)); + if (end > page_boundary) + ext4_truncate_folio(inode, + round_down(end, PAGE_SIZE), end); + } + +truncate_pagecache: + truncate_pagecache_range(inode, start, end - 1); + return 0; +} + +static void ext4_wait_dax_page(struct inode *inode) +{ + filemap_invalidate_unlock(inode->i_mapping); + schedule(); + filemap_invalidate_lock(inode->i_mapping); +} + +int ext4_break_layouts(struct inode *inode) +{ + if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock))) + return -EINVAL; + + return dax_break_layout_inode(inode, ext4_wait_dax_page); +} + +/* + * ext4_punch_hole: punches a hole in a file by releasing the blocks + * associated with the given offset and length + * + * @file: The file to punch the hole in + * @offset: The offset where the hole will begin + * @length: The length of the hole + * + * Returns: 0 on success or negative on failure + */ + +int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) +{ + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + ext4_lblk_t start_lblk, end_lblk; + loff_t max_end = sb->s_maxbytes; + loff_t end = offset + length; + handle_t *handle; + unsigned int credits; + int ret; + + trace_ext4_punch_hole(inode, offset, length, 0); + WARN_ON_ONCE(!inode_is_locked(inode)); + + /* + * For indirect-block based inodes, make sure that the hole ends at + * least one block before the maximum file size. + */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize; + + /* No need to punch hole beyond i_size */ + if (offset >= inode->i_size || offset >= max_end) + return 0; + + /* + * If the hole extends beyond i_size, set the hole to end after + * the page that contains i_size. + */ + if (end > inode->i_size) + end = round_up(inode->i_size, PAGE_SIZE); + if (end > max_end) + end = max_end; + length = end - offset; + + /* + * Attach jinode to inode for jbd2 if we do any zeroing of partial + * block.
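+ * (In data=ordered mode the jinode is what lets + * ext4_jbd2_inode_add_write() tie the zeroed range to the running + * transaction.)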
+ */ + if (!IS_ALIGNED(offset | end, sb->s_blocksize)) { + ret = ext4_inode_attach_jinode(inode); + if (ret < 0) + return ret; + } + + + ret = ext4_update_disksize_before_punch(inode, offset, length); + if (ret) + return ret; + + /* Now release the pages and zero block aligned part of pages*/ + ret = ext4_truncate_page_cache_block_range(inode, offset, end); + if (ret) + return ret; + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + credits = ext4_chunk_trans_extent(inode, 2); + else + credits = ext4_blocks_for_truncate(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_std_error(sb, ret); + return ret; + } + + ret = ext4_zero_partial_blocks(handle, inode, offset, length); + if (ret) + goto out_handle; + + /* If there are blocks to remove, do it */ + start_lblk = EXT4_B_TO_LBLK(inode, offset); + end_lblk = end >> inode->i_blkbits; + + if (end_lblk > start_lblk) { + ext4_lblk_t hole_len = end_lblk - start_lblk; + + ext4_fc_track_inode(handle, inode); + ext4_check_map_extents_env(inode); + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + ext4_es_remove_extent(inode, start_lblk, hole_len); + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ret = ext4_ext_remove_space(inode, start_lblk, + end_lblk - 1); + else + ret = ext4_ind_remove_space(handle, inode, start_lblk, + end_lblk); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_handle; + } + + ext4_es_insert_extent(inode, start_lblk, hole_len, ~0, + EXTENT_STATUS_HOLE, 0); + up_write(&EXT4_I(inode)->i_data_sem); + } + ext4_fc_track_range(handle, inode, start_lblk, end_lblk); + + ret = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret)) + goto out_handle; + + ext4_update_inode_fsync_trans(handle, inode, 1); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); +out_handle: + ext4_journal_stop(handle); + return ret; +} + +int ext4_inode_attach_jinode(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct jbd2_inode *jinode; + + if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) + return 0; + + jinode = jbd2_alloc_inode(GFP_KERNEL); + spin_lock(&inode->i_lock); + if (!ei->jinode) { + if (!jinode) { + spin_unlock(&inode->i_lock); + return -ENOMEM; + } + ei->jinode = jinode; + jbd2_journal_init_jbd_inode(ei->jinode, inode); + jinode = NULL; + } + spin_unlock(&inode->i_lock); + if (unlikely(jinode != NULL)) + jbd2_free_inode(jinode); + return 0; +} + +/* + * ext4_truncate() + * + * We block out ext4_get_block() block instantiations across the entire + * transaction, and VFS/VM ensures that ext4_truncate() cannot run + * simultaneously on behalf of the same inode. + * + * As we work through the truncate and commit bits of it to the journal there + * is one core, guiding principle: the file's tree must always be consistent on + * disk. We must be able to restart the truncate after a crash. + * + * The file's tree may be transiently inconsistent in memory (although it + * probably isn't), but whenever we close off and commit a journal transaction, + * the contents of (the filesystem + the journal) must be consistent and + * restartable. It's pretty simple, really: bottom up, right to left (although + * left-to-right works OK too). + * + * Note that at recovery time, journal replay occurs *before* the restart of + * truncate against the orphan inode list. + * + * The committed inode has the new, desired i_size (which is the same as + * i_disksize in this case). 
After a crash, ext4_orphan_cleanup() will see + * that this inode's truncate did not complete and it will again call + * ext4_truncate() to have another go. So there will be instantiated blocks + * to the right of the truncation point in a crashed ext4 filesystem. But + * that's fine - as long as they are linked from the inode, the post-crash + * ext4_truncate() run will find them and release them. + */ +int ext4_truncate(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int credits; + int err = 0, err2; + handle_t *handle; + struct address_space *mapping = inode->i_mapping; + + /* + * There is a possibility that we're either freeing the inode + * or it's a completely new inode. In those cases we might not + * have i_rwsem locked because it's not necessary. + */ + if (!(inode->i_state & (I_NEW|I_FREEING))) + WARN_ON(!inode_is_locked(inode)); + trace_ext4_truncate_enter(inode); + + if (!ext4_can_truncate(inode)) + goto out_trace; + + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) + ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + + if (ext4_has_inline_data(inode)) { + int has_inline = 1; + + err = ext4_inline_data_truncate(inode, &has_inline); + if (err || has_inline) + goto out_trace; + } + + /* If we zero-out tail of the page, we have to create jinode for jbd2 */ + if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { + err = ext4_inode_attach_jinode(inode); + if (err) + goto out_trace; + } + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + credits = ext4_chunk_trans_extent(inode, 1); + else + credits = ext4_blocks_for_truncate(inode); + + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out_trace; + } + + if (inode->i_size & (inode->i_sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, mapping, inode->i_size); + + /* + * We add the inode to the orphan list, so that if this + * truncate spans multiple transactions, and we crash, we will + * resume the truncate when the filesystem recovers. It also + * marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + err = ext4_orphan_add(handle, inode); + if (err) + goto out_stop; + + ext4_fc_track_inode(handle, inode); + ext4_check_map_extents_env(inode); + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + err = ext4_ext_truncate(handle, inode); + else + ext4_ind_truncate(handle, inode); + + up_write(&ei->i_data_sem); + if (err) + goto out_stop; + + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + +out_stop: + /* + * If this was a simple ftruncate() and the file will remain alive, + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext4_evict_inode(), and we allow that function to clean up the + * orphan info for us. 
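+ * (hence the i_nlink check below)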
+ */ + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + err2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err2 && !err)) + err = err2; + ext4_journal_stop(handle); + +out_trace: + trace_ext4_truncate_exit(inode); + return err; +} + +static inline u64 ext4_inode_peek_iversion(const struct inode *inode) +{ + if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) + return inode_peek_iversion_raw(inode); + else + return inode_peek_iversion(inode); +} + +static int ext4_inode_blocks_set(struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) +{ + struct inode *inode = &(ei->vfs_inode); + u64 i_blocks = READ_ONCE(inode->i_blocks); + struct super_block *sb = inode->i_sb; + + if (i_blocks <= ~0U) { + /* + * i_blocks can be represented in a 32 bit variable + * as multiple of 512 bytes + */ + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = 0; + ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); + return 0; + } + + /* + * This should never happen since sb->s_maxbytes should not have + * allowed this, sb->s_maxbytes was set according to the huge_file + * feature in ext4_fill_super(). + */ + if (!ext4_has_feature_huge_file(sb)) + return -EFSCORRUPTED; + + if (i_blocks <= 0xffffffffffffULL) { + /* + * i_blocks can be represented in a 48 bit variable + * as multiple of 512 bytes + */ + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); + ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); + } else { + ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); + /* i_block is stored in file system block size */ + i_blocks = i_blocks >> (inode->i_blkbits - 9); + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); + } + return 0; +} + +static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + uid_t i_uid; + gid_t i_gid; + projid_t i_projid; + int block; + int err; + + err = ext4_inode_blocks_set(raw_inode, ei); + + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + i_uid = i_uid_read(inode); + i_gid = i_gid_read(inode); + i_projid = from_kprojid(&init_user_ns, ei->i_projid); + if (!(test_opt(inode->i_sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); + /* + * Fix up interoperability with old kernels. Otherwise, + * old inodes get re-used with the upper 16 bits of the + * uid/gid intact. 
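+ * (for a deleted inode that is no longer orphan-tracked, the high + * 16 bits are written as zero)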
+ */ + if (ei->i_dtime && !ext4_inode_orphan_tracked(inode)) { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } else { + raw_inode->i_uid_high = + cpu_to_le16(high_16_bits(i_uid)); + raw_inode->i_gid_high = + cpu_to_le16(high_16_bits(i_gid)); + } + } else { + raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + + EXT4_INODE_SET_CTIME(inode, raw_inode); + EXT4_INODE_SET_MTIME(inode, raw_inode); + EXT4_INODE_SET_ATIME(inode, raw_inode); + EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); + + raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); + raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) + raw_inode->i_file_acl_high = + cpu_to_le16(ei->i_file_acl >> 32); + raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); + ext4_isize_set(raw_inode, ei->i_disksize); + + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + raw_inode->i_block[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + raw_inode->i_block[1] = 0; + } else { + raw_inode->i_block[0] = 0; + raw_inode->i_block[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + raw_inode->i_block[2] = 0; + } + } else if (!ext4_has_inline_data(inode)) { + for (block = 0; block < EXT4_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; + } + + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { + u64 ivers = ext4_inode_peek_iversion(inode); + + raw_inode->i_disk_version = cpu_to_le32(ivers); + if (ei->i_extra_isize) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + raw_inode->i_version_hi = + cpu_to_le32(ivers >> 32); + raw_inode->i_extra_isize = + cpu_to_le16(ei->i_extra_isize); + } + } + + if (i_projid != EXT4_DEF_PROJID && + !ext4_has_feature_project(inode->i_sb)) + err = err ?: -EFSCORRUPTED; + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) + raw_inode->i_projid = cpu_to_le32(i_projid); + + ext4_inode_csum_set(inode, raw_inode, ei); + return err; +} + +/* + * ext4_get_inode_loc returns with an extra refcount against the inode's + * underlying buffer_head on success. If we pass 'inode' and it does not + * have in-inode xattr, we have all inode data in memory that is needed + * to recreate the on-disk version of this inode. 
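+ * + * On a failed read, *ret_block (when non-NULL) is set to the block + * number that could not be read.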
+ */ +static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, + struct inode *inode, struct ext4_iloc *iloc, + ext4_fsblk_t *ret_block) +{ + struct ext4_group_desc *gdp; + struct buffer_head *bh; + ext4_fsblk_t block; + struct blk_plug plug; + int inodes_per_block, inode_offset; + + iloc->bh = NULL; + if (ino < EXT4_ROOT_INO || + ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) + return -EFSCORRUPTED; + + iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); + gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); + if (!gdp) + return -EIO; + + /* + * Figure out the offset within the block group inode table + */ + inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + inode_offset = ((ino - 1) % + EXT4_INODES_PER_GROUP(sb)); + iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); + + block = ext4_inode_table(sb, gdp); + if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) || + (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) { + ext4_error(sb, "Invalid inode table block %llu in " + "block_group %u", block, iloc->block_group); + return -EFSCORRUPTED; + } + block += (inode_offset / inodes_per_block); + + bh = sb_getblk(sb, block); + if (unlikely(!bh)) + return -ENOMEM; + if (ext4_buffer_uptodate(bh)) + goto has_buffer; + + lock_buffer(bh); + if (ext4_buffer_uptodate(bh)) { + /* Someone brought it uptodate while we waited */ + unlock_buffer(bh); + goto has_buffer; + } + + /* + * If we have all information of the inode in memory and this + * is the only valid inode in the block, we need not read the + * block. + */ + if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + struct buffer_head *bitmap_bh; + int i, start; + + start = inode_offset & ~(inodes_per_block - 1); + + /* Is the inode bitmap in cache? */ + bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); + if (unlikely(!bitmap_bh)) + goto make_io; + + /* + * If the inode bitmap isn't in cache then the + * optimisation may end up performing two reads instead + * of one, so skip it. + */ + if (!buffer_uptodate(bitmap_bh)) { + brelse(bitmap_bh); + goto make_io; + } + for (i = start; i < start + inodes_per_block; i++) { + if (i == inode_offset) + continue; + if (ext4_test_bit(i, bitmap_bh->b_data)) + break; + } + brelse(bitmap_bh); + if (i == start + inodes_per_block) { + struct ext4_inode *raw_inode = + (struct ext4_inode *) (bh->b_data + iloc->offset); + + /* all other inodes are free, so skip I/O */ + memset(bh->b_data, 0, bh->b_size); + if (!ext4_test_inode_state(inode, EXT4_STATE_NEW)) + ext4_fill_raw_inode(inode, raw_inode); + set_buffer_uptodate(bh); + unlock_buffer(bh); + goto has_buffer; + } + } + +make_io: + /* + * If we need to do any I/O, try to pre-readahead extra + * blocks from the inode table. + */ + blk_start_plug(&plug); + if (EXT4_SB(sb)->s_inode_readahead_blks) { + ext4_fsblk_t b, end, table; + unsigned num; + __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; + + table = ext4_inode_table(sb, gdp); + /* s_inode_readahead_blks is always a power of 2 */ + b = block & ~((ext4_fsblk_t) ra_blks - 1); + if (table > b) + b = table; + end = b + ra_blks; + num = EXT4_INODES_PER_GROUP(sb); + if (ext4_has_group_desc_csum(sb)) + num -= ext4_itable_unused_count(sb, gdp); + table += num / inodes_per_block; + if (end > table) + end = table; + while (b <= end) + ext4_sb_breadahead_unmovable(sb, b++); + } + + /* + * There are other valid inodes in the buffer, this inode + * has in-inode xattrs, or we don't have this inode in memory. + * Read the block from disk. 
+ */ + trace_ext4_load_inode(sb, ino); + ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL, + ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)); + blk_finish_plug(&plug); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + if (ret_block) + *ret_block = block; + brelse(bh); + return -EIO; + } +has_buffer: + iloc->bh = bh; + return 0; +} + +static int __ext4_get_inode_loc_noinmem(struct inode *inode, + struct ext4_iloc *iloc) +{ + ext4_fsblk_t err_blk = 0; + int ret; + + ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc, + &err_blk); + + if (ret == -EIO) + ext4_error_inode_block(inode, err_blk, EIO, + "unable to read itable block"); + + return ret; +} + +int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) +{ + ext4_fsblk_t err_blk = 0; + int ret; + + ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc, + &err_blk); + + if (ret == -EIO) + ext4_error_inode_block(inode, err_blk, EIO, + "unable to read itable block"); + + return ret; +} + + +int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, + struct ext4_iloc *iloc) +{ + return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL); +} + +static bool ext4_should_enable_dax(struct inode *inode) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (test_opt2(inode->i_sb, DAX_NEVER)) + return false; + if (!S_ISREG(inode->i_mode)) + return false; + if (ext4_should_journal_data(inode)) + return false; + if (ext4_has_inline_data(inode)) + return false; + if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT)) + return false; + if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY)) + return false; + if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) + return false; + if (test_opt(inode->i_sb, DAX_ALWAYS)) + return true; + + return ext4_test_inode_flag(inode, EXT4_INODE_DAX); +} + +void ext4_set_inode_flags(struct inode *inode, bool init) +{ + unsigned int flags = EXT4_I(inode)->i_flags; + unsigned int new_fl = 0; + + WARN_ON_ONCE(IS_DAX(inode) && init); + + if (flags & EXT4_SYNC_FL) + new_fl |= S_SYNC; + if (flags & EXT4_APPEND_FL) + new_fl |= S_APPEND; + if (flags & EXT4_IMMUTABLE_FL) + new_fl |= S_IMMUTABLE; + if (flags & EXT4_NOATIME_FL) + new_fl |= S_NOATIME; + if (flags & EXT4_DIRSYNC_FL) + new_fl |= S_DIRSYNC; + + /* Because of the way inode_set_flags() works we must preserve S_DAX + * here if already set. 
*/ + new_fl |= (inode->i_flags & S_DAX); + if (init && ext4_should_enable_dax(inode)) + new_fl |= S_DAX; + + if (flags & EXT4_ENCRYPT_FL) + new_fl |= S_ENCRYPTED; + if (flags & EXT4_CASEFOLD_FL) + new_fl |= S_CASEFOLD; + if (flags & EXT4_VERITY_FL) + new_fl |= S_VERITY; + inode_set_flags(inode, new_fl, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| + S_ENCRYPTED|S_CASEFOLD|S_VERITY); +} + +static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) +{ + blkcnt_t i_blocks ; + struct inode *inode = &(ei->vfs_inode); + struct super_block *sb = inode->i_sb; + + if (ext4_has_feature_huge_file(sb)) { + /* we are using combined 48 bit field */ + i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | + le32_to_cpu(raw_inode->i_blocks_lo); + if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { + /* i_blocks represent file system block size */ + return i_blocks << (inode->i_blkbits - 9); + } else { + return i_blocks; + } + } else { + return le32_to_cpu(raw_inode->i_blocks_lo); + } +} + +static inline int ext4_iget_extra_inode(struct inode *inode, + struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) +{ + __le32 *magic = (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; + + if (EXT4_INODE_HAS_XATTR_SPACE(inode) && + *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { + int err; + + err = xattr_check_inode(inode, IHDR(inode, raw_inode), + ITAIL(inode, raw_inode)); + if (err) + return err; + + ext4_set_inode_state(inode, EXT4_STATE_XATTR); + err = ext4_find_inline_data_nolock(inode); + if (!err && ext4_has_inline_data(inode)) + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + return err; + } else + EXT4_I(inode)->i_inline_off = 0; + return 0; +} + +int ext4_get_projid(struct inode *inode, kprojid_t *projid) +{ + if (!ext4_has_feature_project(inode->i_sb)) + return -EOPNOTSUPP; + *projid = EXT4_I(inode)->i_projid; + return 0; +} + +/* + * ext4 has self-managed i_version for ea inodes, it stores the lower 32bit of + * refcount in i_version, so use raw values if inode has EXT4_EA_INODE_FL flag + * set. 
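+ * (ext4_inode_peek_iversion() above is the read-side counterpart)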
+ */ +static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) +{ + if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) + inode_set_iversion_raw(inode, val); + else + inode_set_iversion_queried(inode, val); +} + +static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, + const char *function, unsigned int line) +{ + const char *err_str; + + if (flags & EXT4_IGET_EA_INODE) { + if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + err_str = "missing EA_INODE flag"; + goto error; + } + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) || + EXT4_I(inode)->i_file_acl) { + err_str = "ea_inode with extended attributes"; + goto error; + } + } else { + if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + /* + * open_by_handle_at() could provide an old inode number + * that has since been reused for an ea_inode; this does + * not indicate filesystem corruption + */ + if (flags & EXT4_IGET_HANDLE) + return -ESTALE; + err_str = "unexpected EA_INODE flag"; + goto error; + } + } + if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { + err_str = "unexpected bad inode w/o EXT4_IGET_BAD"; + goto error; + } + return 0; + +error: + ext4_error_inode(inode, function, line, 0, "%s", err_str); + return -EFSCORRUPTED; +} + +static bool ext4_should_enable_large_folio(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (!S_ISREG(inode->i_mode)) + return false; + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) + return false; + if (ext4_has_feature_verity(sb)) + return false; + if (ext4_has_feature_encrypt(sb)) + return false; + + return true; +} + +/* + * Limit the maximum folio order to 2048 blocks to prevent overestimation + * of reserve handle credits during the folio writeback in environments + * where the PAGE_SIZE exceeds 4KB. 
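+ * + * 2048 blocks is 2^11, hence the 11 below. For example, with 4K blocks + * and 64K pages the order is capped at 11 + 12 - 16 = 7, i.e. 8M folios.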
+ */ +#define EXT4_MAX_PAGECACHE_ORDER(i) \ + umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT)) +void ext4_set_inode_mapping_order(struct inode *inode) +{ + if (!ext4_should_enable_large_folio(inode)) + return; + + mapping_set_folio_order_range(inode->i_mapping, 0, + EXT4_MAX_PAGECACHE_ORDER(inode)); +} + +struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + ext4_iget_flags flags, const char *function, + unsigned int line) +{ + struct ext4_iloc iloc; + struct ext4_inode *raw_inode; + struct ext4_inode_info *ei; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct inode *inode; + journal_t *journal = EXT4_SB(sb)->s_journal; + long ret; + loff_t size; + int block; + uid_t i_uid; + gid_t i_gid; + projid_t i_projid; + + if ((!(flags & EXT4_IGET_SPECIAL) && is_special_ino(sb, ino)) || + (ino < EXT4_ROOT_INO) || + (ino > le32_to_cpu(es->s_inodes_count))) { + if (flags & EXT4_IGET_HANDLE) + return ERR_PTR(-ESTALE); + __ext4_error(sb, function, line, false, EFSCORRUPTED, 0, + "inode #%lu: comm %s: iget: illegal inode #", + ino, current->comm); + return ERR_PTR(-EFSCORRUPTED); + } + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) { + ret = check_igot_inode(inode, flags, function, line); + if (ret) { + iput(inode); + return ERR_PTR(ret); + } + return inode; + } + + ei = EXT4_I(inode); + iloc.bh = NULL; + + ret = __ext4_get_inode_loc_noinmem(inode, &iloc); + if (ret < 0) + goto bad_inode; + raw_inode = ext4_raw_inode(&iloc); + + if ((flags & EXT4_IGET_HANDLE) && + (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { + ret = -ESTALE; + goto bad_inode; + } + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); + if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > + EXT4_INODE_SIZE(inode->i_sb) || + (ei->i_extra_isize & 3)) { + ext4_error_inode(inode, function, line, 0, + "iget: bad extra_isize %u " + "(inode size %u)", + ei->i_extra_isize, + EXT4_INODE_SIZE(inode->i_sb)); + ret = -EFSCORRUPTED; + goto bad_inode; + } + } else + ei->i_extra_isize = 0; + + /* Precompute checksum seed for inode metadata */ + if (ext4_has_feature_metadata_csum(sb)) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u32 csum; + __le32 inum = cpu_to_le32(inode->i_ino); + __le32 gen = raw_inode->i_generation; + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, + sizeof(inum)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); + } + + if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || + ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) && + (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) { + ext4_error_inode_err(inode, function, line, 0, + EFSBADCRC, "iget: checksum invalid"); + ret = -EFSBADCRC; + goto bad_inode; + } + + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if (ext4_has_feature_project(sb) && + EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) + i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); + else + i_projid = EXT4_DEF_PROJID; + + if (!(test_opt(inode->i_sb, NO_UID32))) { + i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } + i_uid_write(inode, i_uid); + i_gid_write(inode, i_gid); + ei->i_projid = make_kprojid(&init_user_ns, i_projid); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); + + 
ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ + ei->i_inline_off = 0; + ei->i_dir_start_lookup = 0; + ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); + /* We now have enough fields to check if the inode was active or not. + * This is needed because nfsd might try to access dead inodes + * the test is that same one that e2fsck uses + * NeilBrown 1999oct15 + */ + if (inode->i_nlink == 0) { + if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL || + !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && + ino != EXT4_BOOT_LOADER_INO) { + /* this inode is deleted or unallocated */ + if (flags & EXT4_IGET_SPECIAL) { + ext4_error_inode(inode, function, line, 0, + "iget: special inode unallocated"); + ret = -EFSCORRUPTED; + } else + ret = -ESTALE; + goto bad_inode; + } + /* The only unlinked inodes we let through here have + * valid i_mode and are being read by the orphan + * recovery code: that's fine, we're about to complete + * the process of deleting those. + * OR it is the EXT4_BOOT_LOADER_INO which is + * not initialized on a new filesystem. */ + } + ei->i_flags = le32_to_cpu(raw_inode->i_flags); + ext4_set_inode_flags(inode, true); + /* Detect invalid flag combination - can't have both inline data and extents */ + if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ext4_error_inode(inode, function, line, 0, + "inode has both inline data and extents flags"); + ret = -EFSCORRUPTED; + goto bad_inode; + } + inode->i_blocks = ext4_inode_blocks(raw_inode, ei); + ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); + if (ext4_has_feature_64bit(sb)) + ei->i_file_acl |= + ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; + inode->i_size = ext4_isize(sb, raw_inode); + size = i_size_read(inode); + if (size < 0 || size > ext4_get_maxbytes(inode)) { + ext4_error_inode(inode, function, line, 0, + "iget: bad i_size value: %lld", size); + ret = -EFSCORRUPTED; + goto bad_inode; + } + /* + * If dir_index is not enabled but there's dir with INDEX flag set, + * we'd normally treat htree data as empty space. But with metadata + * checksumming that corrupts checksums so forbid that. + */ + if (!ext4_has_feature_dir_index(sb) && + ext4_has_feature_metadata_csum(sb) && + ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { + ext4_error_inode(inode, function, line, 0, + "iget: Dir with htree data on filesystem without dir_index feature."); + ret = -EFSCORRUPTED; + goto bad_inode; + } + ei->i_disksize = inode->i_size; +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +#endif + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + ei->i_block_group = iloc.block_group; + ei->i_last_alloc_group = ~0; + /* + * NOTE! The in-memory inode i_data array is in little-endian order + * even on big-endian machines: we do NOT byteswap the block numbers! + */ + for (block = 0; block < EXT4_N_BLOCKS; block++) + ei->i_data[block] = raw_inode->i_block[block]; + INIT_LIST_HEAD(&ei->i_orphan); + ext4_fc_init_inode(&ei->vfs_inode); + + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. 
+ */ + if (journal) { + transaction_t *transaction; + tid_t tid; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + read_unlock(&journal->j_state_lock); + ei->i_sync_tid = tid; + ei->i_datasync_tid = tid; + } + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + if (ei->i_extra_isize == 0) { + /* The extra space is currently unused. Use it. */ + BUILD_BUG_ON(sizeof(struct ext4_inode) & 3); + ei->i_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + } else { + ret = ext4_iget_extra_inode(inode, raw_inode, ei); + if (ret) + goto bad_inode; + } + } + + EXT4_INODE_GET_CTIME(inode, raw_inode); + EXT4_INODE_GET_ATIME(inode, raw_inode); + EXT4_INODE_GET_MTIME(inode, raw_inode); + EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); + + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { + u64 ivers = le32_to_cpu(raw_inode->i_disk_version); + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + ivers |= + (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + } + ext4_inode_set_iversion_queried(inode, ivers); + } + + ret = 0; + if (ei->i_file_acl && + !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) { + ext4_error_inode(inode, function, line, 0, + "iget: bad extended attribute block %llu", + ei->i_file_acl); + ret = -EFSCORRUPTED; + goto bad_inode; + } else if (!ext4_has_inline_data(inode)) { + /* validate the block references in the inode */ + if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) && + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode)))) { + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ret = ext4_ext_check_inode(inode); + else + ret = ext4_ind_check_inode(inode); + } + } + if (ret) + goto bad_inode; + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext4_file_inode_operations; + inode->i_fop = &ext4_file_operations; + ext4_set_aops(inode); + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ext4_dir_inode_operations; + inode->i_fop = &ext4_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + /* VFS does not allow setting these so must be corruption */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { + ext4_error_inode(inode, function, line, 0, + "iget: immutable or append flags " + "not allowed on symlinks"); + ret = -EFSCORRUPTED; + goto bad_inode; + } + if (IS_ENCRYPTED(inode)) { + inode->i_op = &ext4_encrypted_symlink_inode_operations; + } else if (ext4_inode_is_fast_symlink(inode)) { + inode->i_op = &ext4_fast_symlink_inode_operations; + if (inode->i_size == 0 || + inode->i_size >= sizeof(ei->i_data) || + strnlen((char *)ei->i_data, inode->i_size + 1) != + inode->i_size) { + ext4_error_inode(inode, function, line, 0, + "invalid fast symlink length %llu", + (unsigned long long)inode->i_size); + ret = -EFSCORRUPTED; + goto bad_inode; + } + inode_set_cached_link(inode, (char *)ei->i_data, + inode->i_size); + } else { + inode->i_op = &ext4_symlink_inode_operations; + } + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + inode->i_op = &ext4_special_inode_operations; + if (raw_inode->i_block[0]) + init_special_inode(inode, inode->i_mode, + old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); + else + 
init_special_inode(inode, inode->i_mode, + new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } else if (ino == EXT4_BOOT_LOADER_INO) { + make_bad_inode(inode); + } else { + ret = -EFSCORRUPTED; + ext4_error_inode(inode, function, line, 0, + "iget: bogus i_mode (%o)", inode->i_mode); + goto bad_inode; + } + if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) { + ext4_error_inode(inode, function, line, 0, + "casefold flag without casefold feature"); + ret = -EFSCORRUPTED; + goto bad_inode; + } + + ext4_set_inode_mapping_order(inode); + + ret = check_igot_inode(inode, flags, function, line); + /* + * -ESTALE here means there is nothing inherently wrong with the inode, + * it's just not an inode we can return for an fhandle lookup. + */ + if (ret == -ESTALE) { + brelse(iloc.bh); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(-ESTALE); + } + if (ret) + goto bad_inode; + brelse(iloc.bh); + + unlock_new_inode(inode); + return inode; + +bad_inode: + brelse(iloc.bh); + iget_failed(inode); + return ERR_PTR(ret); +} + +static void __ext4_update_other_inode_time(struct super_block *sb, + unsigned long orig_ino, + unsigned long ino, + struct ext4_inode *raw_inode) +{ + struct inode *inode; + + inode = find_inode_by_ino_rcu(sb, ino); + if (!inode) + return; + + if (!inode_is_dirtytime_only(inode)) + return; + + spin_lock(&inode->i_lock); + if (inode_is_dirtytime_only(inode)) { + struct ext4_inode_info *ei = EXT4_I(inode); + + inode->i_state &= ~I_DIRTY_TIME; + spin_unlock(&inode->i_lock); + + spin_lock(&ei->i_raw_lock); + EXT4_INODE_SET_CTIME(inode, raw_inode); + EXT4_INODE_SET_MTIME(inode, raw_inode); + EXT4_INODE_SET_ATIME(inode, raw_inode); + ext4_inode_csum_set(inode, raw_inode, ei); + spin_unlock(&ei->i_raw_lock); + trace_ext4_other_inode_update_time(inode, orig_ino); + return; + } + spin_unlock(&inode->i_lock); +} + +/* + * Opportunistically update the other time fields for other inodes in + * the same inode table block. + */ +static void ext4_update_other_inodes_time(struct super_block *sb, + unsigned long orig_ino, char *buf) +{ + unsigned long ino; + int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + int inode_size = EXT4_INODE_SIZE(sb); + + /* + * Calculate the first inode in the inode table block. Inode + * numbers are one-based. That is, the first inode in a block + * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1). + */ + ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1; + rcu_read_lock(); + for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { + if (ino == orig_ino) + continue; + __ext4_update_other_inode_time(sb, orig_ino, ino, + (struct ext4_inode *)buf); + } + rcu_read_unlock(); +} + +/* + * Post the struct inode info into an on-disk inode location in the + * buffer-cache. This gobbles the caller's reference to the + * buffer_head in the inode location struct. + * + * The caller must have write access to iloc->bh. + */ +static int ext4_do_update_inode(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc) +{ + struct ext4_inode *raw_inode = ext4_raw_inode(iloc); + struct ext4_inode_info *ei = EXT4_I(inode); + struct buffer_head *bh = iloc->bh; + struct super_block *sb = inode->i_sb; + int err; + int need_datasync = 0, set_large_file = 0; + + spin_lock(&ei->i_raw_lock); + + /* + * For fields not tracked in the in-memory inode, initialise them + * to zero for new inodes. 
+ */ + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) + memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); + + if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) + need_datasync = 1; + if (ei->i_disksize > 0x7fffffffULL) { + if (!ext4_has_feature_large_file(sb) || + EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV)) + set_large_file = 1; + } + + err = ext4_fill_raw_inode(inode, raw_inode); + spin_unlock(&ei->i_raw_lock); + if (err) { + EXT4_ERROR_INODE(inode, "corrupted inode contents"); + goto out_brelse; + } + + if (inode->i_sb->s_flags & SB_LAZYTIME) + ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, + bh->b_data); + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out_error; + ext4_clear_inode_state(inode, EXT4_STATE_NEW); + if (set_large_file) { + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); + err = ext4_journal_get_write_access(handle, sb, + EXT4_SB(sb)->s_sbh, + EXT4_JTR_NONE); + if (err) + goto out_error; + lock_buffer(EXT4_SB(sb)->s_sbh); + ext4_set_feature_large_file(sb); + ext4_superblock_csum_set(sb); + unlock_buffer(EXT4_SB(sb)->s_sbh); + ext4_handle_sync(handle); + err = ext4_handle_dirty_metadata(handle, NULL, + EXT4_SB(sb)->s_sbh); + } + ext4_update_inode_fsync_trans(handle, inode, need_datasync); +out_error: + ext4_std_error(inode->i_sb, err); +out_brelse: + brelse(bh); + return err; +} + +/* + * ext4_write_inode() + * + * We are called from a few places: + * + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. + * Here, there will be no transaction running. We wait for any running + * transaction to commit. + * + * - Within flush work (sys_sync(), kupdate and such). + * We wait on commit, if told to. + * + * - Within iput_final() -> write_inode_now() + * We wait on commit, if told to. + * + * In all cases it is actually safe for us to return without doing anything, + * because the inode has been copied into a raw inode buffer in + * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL + * writeback. + * + * Note that we are absolutely dependent upon all inode dirtiers doing the + * right thing: they *must* call mark_inode_dirty() after dirtying info in + * which we are interested. + * + * It would be a bug for them to not do this. The code: + * + * mark_inode_dirty(inode) + * stuff(); + * inode->i_size = expr; + * + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost. Plus the inode will no longer be on the + * superblock's dirty inode list. + */ +int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int err; + + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) + return 0; + + err = ext4_emergency_state(inode->i_sb); + if (unlikely(err)) + return err; + + if (EXT4_SB(inode->i_sb)->s_journal) { + if (ext4_journal_current_handle()) { + ext4_debug("called recursively, non-PF_MEMALLOC!\n"); + dump_stack(); + return -EIO; + } + + /* + * No need to force transaction in WB_SYNC_NONE mode. Also + * ext4_sync_fs() will force the commit after everything is + * written. + */ + if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) + return 0; + + err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, + EXT4_I(inode)->i_sync_tid); + } else { + struct ext4_iloc iloc; + + err = __ext4_get_inode_loc_noinmem(inode, &iloc); + if (err) + return err; + /* + * sync(2) will flush the whole buffer cache. 
No need to do + * it here separately for each inode. + */ + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) + sync_dirty_buffer(iloc.bh); + if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { + ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO, + "IO error syncing inode"); + err = -EIO; + } + brelse(iloc.bh); + } + return err; +} + +/* + * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate + * buffers that are attached to a folio straddling i_size and are undergoing + * commit. In that case we have to wait for commit to finish and try again. + */ +static void ext4_wait_for_tail_page_commit(struct inode *inode) +{ + unsigned offset; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + tid_t commit_tid; + int ret; + bool has_transaction; + + offset = inode->i_size & (PAGE_SIZE - 1); + /* + * If the folio is fully truncated, we don't need to wait for any commit + * (and we even should not as __ext4_journalled_invalidate_folio() may + * strip all buffers from the folio but keep the folio dirty which can then + * confuse e.g. concurrent ext4_writepages() seeing dirty folio without + * buffers). Also we don't need to wait for any commit if all buffers in + * the folio remain valid. This is most beneficial for the common case of + * blocksize == PAGESIZE. + */ + if (!offset || offset > (PAGE_SIZE - i_blocksize(inode))) + return; + while (1) { + struct folio *folio = filemap_lock_folio(inode->i_mapping, + inode->i_size >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + ret = __ext4_journalled_invalidate_folio(folio, offset, + folio_size(folio) - offset); + folio_unlock(folio); + folio_put(folio); + if (ret != -EBUSY) + return; + has_transaction = false; + read_lock(&journal->j_state_lock); + if (journal->j_committing_transaction) { + commit_tid = journal->j_committing_transaction->t_tid; + has_transaction = true; + } + read_unlock(&journal->j_state_lock); + if (has_transaction) + jbd2_log_wait_commit(journal, commit_tid); + } +} + +/* + * ext4_setattr() + * + * Called from notify_change. + * + * We want to trap VFS attempts to truncate the file as soon as + * possible. In particular, we want to make sure that when the VFS + * shrinks i_size, we put the inode on the orphan list and modify + * i_disksize immediately, so that during the subsequent flushing of + * dirty pages and freeing of disk blocks, we can guarantee that any + * commit will leave the blocks being flushed in an unused state on + * disk. (On recovery, the inode will get truncated and the blocks will + * be freed, so we have a strong guarantee that no future commit will + * leave these blocks visible to the user.) + * + * Another thing we have to assure is that if we are in ordered mode + * and inode is still attached to the committing transaction, we must + * we start writeout of all the dirty pages which are being truncated. + * This way we are sure that all the data written in the previous + * transaction are already on disk (truncate waits for pages under + * writeback). + * + * Called with inode->i_rwsem down. 
+ */ +int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + int error, rc = 0; + int orphan = 0; + const unsigned int ia_valid = attr->ia_valid; + bool inc_ivers = true; + + error = ext4_emergency_state(inode->i_sb); + if (unlikely(error)) + return error; + + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + if (unlikely(IS_APPEND(inode) && + (ia_valid & (ATTR_MODE | ATTR_UID | + ATTR_GID | ATTR_TIMES_SET)))) + return -EPERM; + + error = setattr_prepare(idmap, dentry, attr); + if (error) + return error; + + error = fscrypt_prepare_setattr(dentry, attr); + if (error) + return error; + + error = fsverity_prepare_setattr(dentry, attr); + if (error) + return error; + + if (is_quota_modification(idmap, inode, attr)) { + error = dquot_initialize(inode); + if (error) + return error; + } + + if (i_uid_needs_update(idmap, attr, inode) || + i_gid_needs_update(idmap, attr, inode)) { + handle_t *handle; + + /* (user+group)*(old+new) structure, inode write (sb, + * inode block, ? - but truncate inode update has it) */ + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, + (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + + /* dquot_transfer() calls back ext4_get_inode_usage() which + * counts xattr inode references. + */ + down_read(&EXT4_I(inode)->xattr_sem); + error = dquot_transfer(idmap, inode, attr); + up_read(&EXT4_I(inode)->xattr_sem); + + if (error) { + ext4_journal_stop(handle); + return error; + } + /* Update corresponding info in inode so that everything is in + * one transaction */ + i_uid_update(idmap, attr, inode); + i_gid_update(idmap, attr, inode); + error = ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + if (unlikely(error)) { + return error; + } + } + + if (attr->ia_valid & ATTR_SIZE) { + handle_t *handle; + loff_t oldsize = inode->i_size; + loff_t old_disksize; + int shrink = (attr->ia_size < inode->i_size); + + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (attr->ia_size > sbi->s_bitmap_maxbytes) { + return -EFBIG; + } + } + if (!S_ISREG(inode->i_mode)) { + return -EINVAL; + } + + if (attr->ia_size == inode->i_size) + inc_ivers = false; + + if (shrink) { + if (ext4_should_order_data(inode)) { + error = ext4_begin_ordered_truncate(inode, + attr->ia_size); + if (error) + goto err_out; + } + /* + * Blocks are going to be removed from the inode. Wait + * for dio in flight. + */ + inode_dio_wait(inode); + } + + filemap_invalidate_lock(inode->i_mapping); + + rc = ext4_break_layouts(inode); + if (rc) { + filemap_invalidate_unlock(inode->i_mapping); + goto err_out; + } + + if (attr->ia_size != inode->i_size) { + /* attach jbd2 jinode for EOF folio tail zeroing */ + if (attr->ia_size & (inode->i_sb->s_blocksize - 1) || + oldsize & (inode->i_sb->s_blocksize - 1)) { + error = ext4_inode_attach_jinode(inode); + if (error) + goto out_mmap_sem; + } + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto out_mmap_sem; + } + if (ext4_handle_valid(handle) && shrink) { + error = ext4_orphan_add(handle, inode); + orphan = 1; + } + /* + * Update c/mtime and tail zero the EOF folio on + * truncate up. ext4_truncate() handles the shrink case + * below. 
+ */ + if (!shrink) { + inode_set_mtime_to_ts(inode, + inode_set_ctime_current(inode)); + if (oldsize & (inode->i_sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, + inode->i_mapping, oldsize); + } + + if (shrink) + ext4_fc_track_range(handle, inode, + (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> + inode->i_sb->s_blocksize_bits, + EXT_MAX_BLOCKS - 1); + else + ext4_fc_track_range( + handle, inode, + (oldsize > 0 ? oldsize - 1 : oldsize) >> + inode->i_sb->s_blocksize_bits, + (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> + inode->i_sb->s_blocksize_bits); + + down_write(&EXT4_I(inode)->i_data_sem); + old_disksize = EXT4_I(inode)->i_disksize; + EXT4_I(inode)->i_disksize = attr->ia_size; + + /* + * We have to update i_size under i_data_sem together + * with i_disksize to avoid races with writeback code + * running ext4_wb_update_i_disksize(). + */ + if (!error) + i_size_write(inode, attr->ia_size); + else + EXT4_I(inode)->i_disksize = old_disksize; + up_write(&EXT4_I(inode)->i_data_sem); + rc = ext4_mark_inode_dirty(handle, inode); + if (!error) + error = rc; + ext4_journal_stop(handle); + if (error) + goto out_mmap_sem; + if (!shrink) { + pagecache_isize_extended(inode, oldsize, + inode->i_size); + } else if (ext4_should_journal_data(inode)) { + ext4_wait_for_tail_page_commit(inode); + } + } + + /* + * Truncate pagecache after we've waited for commit + * in data=journal mode to make pages freeable. + */ + truncate_pagecache(inode, inode->i_size); + /* + * Call ext4_truncate() even if i_size didn't change to + * truncate possible preallocated blocks. + */ + if (attr->ia_size <= oldsize) { + rc = ext4_truncate(inode); + if (rc) + error = rc; + } +out_mmap_sem: + filemap_invalidate_unlock(inode->i_mapping); + } + + if (!error) { + if (inc_ivers) + inode_inc_iversion(inode); + setattr_copy(idmap, inode, attr); + mark_inode_dirty(inode); + } + + /* + * If the call to ext4_truncate failed to get a transaction handle at + * all, we need to clean up the in-core orphan list manually. + */ + if (orphan && inode->i_nlink) + ext4_orphan_del(NULL, inode); + + if (!error && (ia_valid & ATTR_MODE)) + rc = posix_acl_chmod(idmap, dentry, inode->i_mode); + +err_out: + if (error) + ext4_std_error(inode->i_sb, error); + if (!error) + error = rc; + return error; +} + +u32 ext4_dio_alignment(struct inode *inode) +{ + if (fsverity_active(inode)) + return 0; + if (ext4_should_journal_data(inode)) + return 0; + if (ext4_has_inline_data(inode)) + return 0; + if (IS_ENCRYPTED(inode)) { + if (!fscrypt_dio_supported(inode)) + return 0; + return i_blocksize(inode); + } + return 1; /* use the iomap defaults */ +} + +int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + struct ext4_inode *raw_inode; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int flags; + + if ((request_mask & STATX_BTIME) && + EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) { + stat->result_mask |= STATX_BTIME; + stat->btime.tv_sec = ei->i_crtime.tv_sec; + stat->btime.tv_nsec = ei->i_crtime.tv_nsec; + } + + /* + * Return the DIO alignment restrictions if requested. We only return + * this information when requested, since on encrypted files it might + * take a fair bit of work to get if the file wasn't opened recently. 
+ */ + if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { + u32 dio_align = ext4_dio_alignment(inode); + + stat->result_mask |= STATX_DIOALIGN; + if (dio_align == 1) { + struct block_device *bdev = inode->i_sb->s_bdev; + + /* iomap defaults */ + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + stat->dio_offset_align = bdev_logical_block_size(bdev); + } else { + stat->dio_mem_align = dio_align; + stat->dio_offset_align = dio_align; + } + } + + if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned int awu_min = 0, awu_max = 0; + + if (ext4_inode_can_atomic_write(inode)) { + awu_min = sbi->s_awu_min; + awu_max = sbi->s_awu_max; + } + + generic_fill_statx_atomic_writes(stat, awu_min, awu_max, 0); + } + + flags = ei->i_flags & EXT4_FL_USER_VISIBLE; + if (flags & EXT4_APPEND_FL) + stat->attributes |= STATX_ATTR_APPEND; + if (flags & EXT4_COMPR_FL) + stat->attributes |= STATX_ATTR_COMPRESSED; + if (flags & EXT4_ENCRYPT_FL) + stat->attributes |= STATX_ATTR_ENCRYPTED; + if (flags & EXT4_IMMUTABLE_FL) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (flags & EXT4_NODUMP_FL) + stat->attributes |= STATX_ATTR_NODUMP; + if (flags & EXT4_VERITY_FL) + stat->attributes |= STATX_ATTR_VERITY; + + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_COMPRESSED | + STATX_ATTR_ENCRYPTED | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP | + STATX_ATTR_VERITY); + + generic_fillattr(idmap, request_mask, inode, stat); + return 0; +} + +int ext4_file_getattr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + u64 delalloc_blocks; + + ext4_getattr(idmap, path, stat, request_mask, query_flags); + + /* + * If there is inline data in the inode, the inode will normally not + * have data blocks allocated (it may have an external xattr block). + * Report at least one sector for such files, so tools like tar, rsync, + * others don't incorrectly think the file is completely sparse. + */ + if (unlikely(ext4_has_inline_data(inode))) + stat->blocks += (stat->size + 511) >> 9; + + /* + * We can't update i_blocks if the block allocation is delayed + * otherwise in the case of system crash before the real block + * allocation is done, we will have i_blocks inconsistent with + * on-disk file blocks. + * We always keep i_blocks updated together with real + * allocation. But to not confuse with user, stat + * will return the blocks that include the delayed allocation + * blocks for this file. + */ + delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), + EXT4_I(inode)->i_reserved_data_blocks); + stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9); + return 0; +} + +static int ext4_index_trans_blocks(struct inode *inode, int lblocks, + int pextents) +{ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return ext4_ind_trans_blocks(inode, lblocks); + return ext4_ext_index_trans_blocks(inode, pextents); +} + +/* + * Account for index blocks, block groups bitmaps and block group + * descriptor blocks if modify datablocks and index blocks + * worse case, the indexs blocks spread over different block groups + * + * If datablocks are discontiguous, they are possible to spread over + * different block groups too. If they are contiguous, with flexbg, + * they could still across block group boundary. 
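+ *
+ * [Editor's worked example, not in the upstream comment: on a filesystem
+ * with at least four block groups, if ext4_index_trans_blocks() returns
+ * idxblocks = 3 and pextents = 1, then groups = gdpblocks = 4 before the
+ * clamps below, so the function returns 3 + 4 + 4 +
+ * EXT4_META_TRANS_BLOCKS(sb) credits.]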
+ *
+ * Also account for superblock, inode, quota and xattr blocks
+ */
+int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents)
+{
+	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
+	int gdpblocks;
+	int idxblocks;
+	int ret;
+
+	/*
+	 * How many index and leaf blocks do we need to touch to map @lblocks
+	 * logical blocks to @pextents physical extents?
+	 */
+	idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
+
+	/*
+	 * Now let's see how many group bitmaps and group descriptor blocks
+	 * we need to account for
+	 */
+	groups = idxblocks + pextents;
+	gdpblocks = groups;
+	if (groups > ngroups)
+		groups = ngroups;
+	if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
+		gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
+
+	/* bitmaps and block group descriptor blocks */
+	ret = idxblocks + groups + gdpblocks;
+
+	/* Blocks for super block, inode, quota and xattr blocks */
+	ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
+
+	return ret;
+}
+
+/*
+ * Calculate the journal credits for modifying the number of blocks
+ * in a single extent within one transaction. 'nrblocks' is used only
+ * for non-extent inodes. For extent type inodes, 'nrblocks' can be
+ * zero if the exact number of blocks is unknown.
+ */
+int ext4_chunk_trans_extent(struct inode *inode, int nrblocks)
+{
+	int ret;
+
+	ret = ext4_meta_trans_blocks(inode, nrblocks, 1);
+	/* Account for data blocks for journalled mode */
+	if (ext4_should_journal_data(inode))
+		ret += nrblocks;
+	return ret;
+}
+
+/*
+ * Calculate the journal credits for a chunk of data modification.
+ *
+ * This is called from DIO, fallocate or whoever calls
+ * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
+ *
+ * Journal buffers for data blocks are not included here, as DIO
+ * and fallocate do not need to journal data buffers.
+ */
+int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
+{
+	return ext4_meta_trans_blocks(inode, nrblocks, 1);
+}
+
+/*
+ * The caller must have previously called ext4_reserve_inode_write().
+ * Given this, we know that the caller already has write access to iloc->bh.
+ */
+int ext4_mark_iloc_dirty(handle_t *handle,
+			 struct inode *inode, struct ext4_iloc *iloc)
+{
+	int err = 0;
+
+	err = ext4_emergency_state(inode->i_sb);
+	if (unlikely(err)) {
+		put_bh(iloc->bh);
+		return err;
+	}
+	ext4_fc_track_inode(handle, inode);
+
+	/* the do_update_inode consumes one bh->b_count */
+	get_bh(iloc->bh);
+
+	/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
+	err = ext4_do_update_inode(handle, inode, iloc);
+	put_bh(iloc->bh);
+	return err;
+}
+
+/*
+ * On success, we end up with an outstanding reference count against
+ * iloc->bh. This _must_ be cleaned up later.
+ */ + +int +ext4_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext4_iloc *iloc) +{ + int err; + + err = ext4_emergency_state(inode->i_sb); + if (unlikely(err)) + return err; + + err = ext4_get_inode_loc(inode, iloc); + if (!err) { + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode->i_sb, + iloc->bh, EXT4_JTR_NONE); + if (err) { + brelse(iloc->bh); + iloc->bh = NULL; + } + ext4_fc_track_inode(handle, inode); + } + ext4_std_error(inode->i_sb, err); + return err; +} + +static int __ext4_expand_extra_isize(struct inode *inode, + unsigned int new_extra_isize, + struct ext4_iloc *iloc, + handle_t *handle, int *no_expand) +{ + struct ext4_inode *raw_inode; + struct ext4_xattr_ibody_header *header; + unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + int error; + + /* this was checked at iget time, but double check for good measure */ + if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) || + (ei->i_extra_isize & 3)) { + EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)", + ei->i_extra_isize, + EXT4_INODE_SIZE(inode->i_sb)); + return -EFSCORRUPTED; + } + if ((new_extra_isize < ei->i_extra_isize) || + (new_extra_isize < 4) || + (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE)) + return -EINVAL; /* Should never happen */ + + raw_inode = ext4_raw_inode(iloc); + + header = IHDR(inode, raw_inode); + + /* No extended attributes present */ + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || + header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { + memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize, 0, + new_extra_isize - EXT4_I(inode)->i_extra_isize); + EXT4_I(inode)->i_extra_isize = new_extra_isize; + return 0; + } + + /* + * We may need to allocate external xattr block so we need quotas + * initialized. Here we can be called with various locks held so we + * cannot affort to initialize quotas ourselves. So just bail. + */ + if (dquot_initialize_needed(inode)) + return -EAGAIN; + + /* try to expand with EAs present */ + error = ext4_expand_extra_isize_ea(inode, new_extra_isize, + raw_inode, handle); + if (error) { + /* + * Inode size expansion failed; don't try again + */ + *no_expand = 1; + } + + return error; +} + +/* + * Expand an inode by new_extra_isize bytes. + * Returns 0 on success or negative error number on failure. + */ +static int ext4_try_to_expand_extra_isize(struct inode *inode, + unsigned int new_extra_isize, + struct ext4_iloc iloc, + handle_t *handle) +{ + int no_expand; + int error; + + if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) + return -EOVERFLOW; + + /* + * In nojournal mode, we can immediately attempt to expand + * the inode. When journaled, we first need to obtain extra + * buffer credits since we may write into the EA block + * with this same handle. If journal_extend fails, then it will + * only result in a minor loss of functionality for that inode. + * If this is felt to be critical, then e2fsck should be run to + * force a large enough s_min_extra_isize. 
+ */ + if (ext4_journal_extend(handle, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0) + return -ENOSPC; + + if (ext4_write_trylock_xattr(inode, &no_expand) == 0) + return -EBUSY; + + error = __ext4_expand_extra_isize(inode, new_extra_isize, &iloc, + handle, &no_expand); + ext4_write_unlock_xattr(inode, &no_expand); + + return error; +} + +int ext4_expand_extra_isize(struct inode *inode, + unsigned int new_extra_isize, + struct ext4_iloc *iloc) +{ + handle_t *handle; + int no_expand; + int error, rc; + + if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { + brelse(iloc->bh); + return -EOVERFLOW; + } + + handle = ext4_journal_start(inode, EXT4_HT_INODE, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + brelse(iloc->bh); + return error; + } + + ext4_write_lock_xattr(inode, &no_expand); + + BUFFER_TRACE(iloc->bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh, + EXT4_JTR_NONE); + if (error) { + brelse(iloc->bh); + goto out_unlock; + } + + error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc, + handle, &no_expand); + + rc = ext4_mark_iloc_dirty(handle, inode, iloc); + if (!error) + error = rc; + +out_unlock: + ext4_write_unlock_xattr(inode, &no_expand); + ext4_journal_stop(handle); + return error; +} + +/* + * What we do here is to mark the in-core inode as clean with respect to inode + * dirtiness (it may still be data-dirty). + * This means that the in-core inode may be reaped by prune_icache + * without having to perform any I/O. This is a very good thing, + * because *any* task may call prune_icache - even ones which + * have a transaction open against a different journal. + * + * Is this cheating? Not really. Sure, we haven't written the + * inode out, but prune_icache isn't a user-visible syncing function. + * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) + * we start and wait on commits. + */ +int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, + const char *func, unsigned int line) +{ + struct ext4_iloc iloc; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int err; + + might_sleep(); + trace_ext4_mark_inode_dirty(inode, _RET_IP_); + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out; + + if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize) + ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize, + iloc, handle); + + err = ext4_mark_iloc_dirty(handle, inode, &iloc); +out: + if (unlikely(err)) + ext4_error_inode_err(inode, func, line, 0, err, + "mark_inode_dirty error"); + return err; +} + +/* + * ext4_dirty_inode() is called from __mark_inode_dirty() + * + * We're really interested in the case where a file is being extended. + * i_size has been changed by generic_commit_write() and we thus need + * to include the updated inode in the current transaction. + * + * Also, dquot_alloc_block() will always dirty the inode when blocks + * are allocated to the file. + * + * If the inode is marked synchronous, we don't honour that here - doing + * so would cause a commit on atime updates, which we don't bother doing. + * We handle synchronous inodes at the highest possible level. 
+ */ +void ext4_dirty_inode(struct inode *inode, int flags) +{ + handle_t *handle; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + return; + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); +} + +int ext4_change_inode_journal_flag(struct inode *inode, int val) +{ + journal_t *journal; + handle_t *handle; + int err; + int alloc_ctx; + + /* + * We have to be very careful here: changing a data block's + * journaling status dynamically is dangerous. If we write a + * data block to the journal, change the status and then delete + * that block, we risk forgetting to revoke the old log record + * from the journal and so a subsequent replay can corrupt data. + * So, first we make sure that the journal is empty and that + * nobody is changing anything. + */ + + journal = EXT4_JOURNAL(inode); + if (!journal) + return 0; + if (is_journal_aborted(journal)) + return -EROFS; + + /* Wait for all existing dio workers */ + inode_dio_wait(inode); + + /* + * Before flushing the journal and switching inode's aops, we have + * to flush all dirty data the inode has. There can be outstanding + * delayed allocations, there can be unwritten extents created by + * fallocate or buffered writes in dioread_nolock mode covered by + * dirty data which can be converted only after flushing the dirty + * data (and journalled aops don't know how to handle these cases). + */ + if (val) { + filemap_invalidate_lock(inode->i_mapping); + err = filemap_write_and_wait(inode->i_mapping); + if (err < 0) { + filemap_invalidate_unlock(inode->i_mapping); + return err; + } + } + + alloc_ctx = ext4_writepages_down_write(inode->i_sb); + jbd2_journal_lock_updates(journal); + + /* + * OK, there are no updates running now, and all cached data is + * synced to disk. We are now in a completely consistent state + * which doesn't have anything in the journal, and we know that + * no filesystem updates are running, so it is safe to modify + * the inode's in-core data-journaling state flag now. + */ + + if (val) + ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); + else { + err = jbd2_journal_flush(journal, 0); + if (err < 0) { + jbd2_journal_unlock_updates(journal); + ext4_writepages_up_write(inode->i_sb, alloc_ctx); + return err; + } + ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); + } + ext4_set_aops(inode); + + jbd2_journal_unlock_updates(journal); + ext4_writepages_up_write(inode->i_sb, alloc_ctx); + + if (val) + filemap_invalidate_unlock(inode->i_mapping); + + /* Finally we can mark the inode as dirty. */ + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ext4_fc_mark_ineligible(inode->i_sb, + EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle); + err = ext4_mark_inode_dirty(handle, inode); + ext4_handle_sync(handle); + ext4_journal_stop(handle); + ext4_std_error(inode->i_sb, err); + + return err; +} + +static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, + struct buffer_head *bh) +{ + return !buffer_mapped(bh); +} + +static int ext4_block_page_mkwrite(struct inode *inode, struct folio *folio, + get_block_t get_block) +{ + handle_t *handle; + loff_t size; + unsigned long len; + int credits; + int ret; + + credits = ext4_chunk_trans_extent(inode, + ext4_journal_blocks_per_folio(inode)); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + folio_lock(folio); + size = i_size_read(inode); + /* Page got truncated from under us? 
*/ + if (folio->mapping != inode->i_mapping || folio_pos(folio) > size) { + ret = -EFAULT; + goto out_error; + } + + len = folio_size(folio); + if (folio_pos(folio) + len > size) + len = size - folio_pos(folio); + + ret = ext4_block_write_begin(handle, folio, 0, len, get_block); + if (ret) + goto out_error; + + if (!ext4_should_journal_data(inode)) { + block_commit_write(folio, 0, len); + folio_mark_dirty(folio); + } else { + ret = ext4_journal_folio_buffers(handle, folio, len); + if (ret) + goto out_error; + } + ext4_journal_stop(handle); + folio_wait_stable(folio); + return ret; + +out_error: + folio_unlock(folio); + ext4_journal_stop(handle); + return ret; +} + +vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct folio *folio = page_folio(vmf->page); + loff_t size; + unsigned long len; + int err; + vm_fault_t ret; + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct address_space *mapping = inode->i_mapping; + get_block_t *get_block = ext4_get_block; + int retries = 0; + + if (unlikely(IS_IMMUTABLE(inode))) + return VM_FAULT_SIGBUS; + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + + filemap_invalidate_lock_shared(mapping); + + err = ext4_convert_inline_data(inode); + if (err) + goto out_ret; + + /* + * On data journalling we skip straight to the transaction handle: + * there's no delalloc; page truncated will be checked later; the + * early return w/ all buffers mapped (calculates size/len) can't + * be used; and there's no dioread_nolock, so only ext4_get_block. + */ + if (ext4_should_journal_data(inode)) + goto retry_alloc; + + /* Delalloc case is easy... */ + if (test_opt(inode->i_sb, DELALLOC) && + !ext4_nonda_switch(inode->i_sb)) { + do { + err = block_page_mkwrite(vma, vmf, + ext4_da_get_block_prep); + } while (err == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)); + goto out_ret; + } + + folio_lock(folio); + size = i_size_read(inode); + /* Page got truncated from under us? */ + if (folio->mapping != mapping || folio_pos(folio) > size) { + folio_unlock(folio); + ret = VM_FAULT_NOPAGE; + goto out; + } + + len = folio_size(folio); + if (folio_pos(folio) + len > size) + len = size - folio_pos(folio); + /* + * Return if we have all the buffers mapped. This avoids the need to do + * journal_start/journal_stop which can block and take a long time + * + * This cannot be done for data journalling, as we have to add the + * inode to the transaction's list to writeprotect pages on commit. + */ + if (folio_buffers(folio)) { + if (!ext4_walk_page_buffers(NULL, inode, folio_buffers(folio), + 0, len, NULL, + ext4_bh_unmapped)) { + /* Wait so that we don't change page under IO */ + folio_wait_stable(folio); + ret = VM_FAULT_LOCKED; + goto out; + } + } + folio_unlock(folio); + /* OK, we need to fill the hole... */ + if (ext4_should_dioread_nolock(inode)) + get_block = ext4_get_block_unwritten; +retry_alloc: + /* Start journal and allocate blocks */ + err = ext4_block_page_mkwrite(inode, folio, get_block); + if (err == -EAGAIN || + (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))) + goto retry_alloc; +out_ret: + ret = vmf_fs_error(err); +out: + filemap_invalidate_unlock_shared(mapping); + sb_end_pagefault(inode->i_sb); + return ret; +} -- 2.43.0
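[Editor's aside, not part of the patch: the STATX_DIOALIGN branch of
ext4_getattr() above can be exercised from userspace. This is a minimal
sketch, assuming a Linux 6.1+ kernel and a libc whose struct statx already
carries the stx_dio_* fields; the fallback define uses the UAPI mask value.]

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

#ifndef STATX_DIOALIGN
#define STATX_DIOALIGN	0x00002000U	/* UAPI mask value (Linux 6.1+) */
#endif

int main(int argc, char **argv)
{
	struct statx stx;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	/* ext4_getattr() fills the DIO alignment fields only when asked */
	if (statx(AT_FDCWD, argv[1], 0, STATX_DIOALIGN, &stx)) {
		perror("statx");
		return 1;
	}
	if (!(stx.stx_mask & STATX_DIOALIGN)) {
		puts("STATX_DIOALIGN not reported for this file");
		return 0;
	}
	printf("dio_mem_align=%u dio_offset_align=%u\n",
	       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
	return 0;
}

[A result of dio_mem_align == dio_offset_align == i_blocksize corresponds to
the encrypted-file case in ext4_dio_alignment() above; otherwise the
bdev-derived iomap defaults are reported.]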
From: Simon Glass <simon.glass@canonical.com>

Copy mballoc.c from the Linux v6.18 fs/ext4 directory. This file
implements the multiblock allocator for efficient block allocation.

Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com>
---
 fs/ext4l/mballoc.c | 7175 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 7175 insertions(+)
 create mode 100644 fs/ext4l/mballoc.c

diff --git a/fs/ext4l/mballoc.c b/fs/ext4l/mballoc.c
new file mode 100644
index 00000000000..9087183602e
--- /dev/null
+++ b/fs/ext4l/mballoc.c
@@ -0,0 +1,7175 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ */
+
+
+/*
+ * mballoc.c contains the multiblock allocation routines
+ */
+
+#include "ext4_jbd2.h"
+#include "mballoc.h"
+#include <linux/log2.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/nospec.h>
+#include <linux/backing-dev.h>
+#include <linux/freezer.h>
+#include <trace/events/ext4.h>
+#include <kunit/static_stub.h>
+
+/*
+ * MUSTDO:
+ *   - test ext4_ext_search_left() and ext4_ext_search_right()
+ *   - search for metadata in few groups
+ *
+ * TODO v4:
+ *   - normalization should take into account whether file is still open
+ *   - discard preallocations if no free space left (policy?)
+ *   - don't normalize tails
+ *   - quota
+ *   - reservation for superuser
+ *
+ * TODO v3:
+ *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
+ *   - track min/max extents in each group for better group selection
+ *   - mb_mark_used() may allocate chunk right after splitting buddy
+ *   - tree of groups sorted by number of free blocks
+ *   - error handling
+ */
+
+/*
+ * An allocation request involves a request for multiple blocks near the
+ * specified goal (block) value.
+ *
+ * During the initialization phase of the allocator we decide whether to use
+ * group preallocation or inode preallocation depending on the size of
+ * the file. The size of the file could be the resulting file size we
+ * would have after allocation, or the current file size, whichever
+ * is larger. If the size is less than sbi->s_mb_stream_request we
+ * select group preallocation. The default value of
+ * s_mb_stream_request is 16 blocks. This can also be tuned via
+ * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
+ * terms of number of blocks.
+ *
+ * The main motivation for having small files use group preallocation is to
+ * ensure that we have small files close together on the disk.
+ *
+ * In the first stage, the allocator looks at the inode prealloc list,
+ * ext4_inode_info->i_prealloc_list, which contains the list of prealloc
+ * spaces for this particular inode. An inode prealloc space is
+ * represented as:
+ *
+ * pa_lstart -> the logical start block for this prealloc space
+ * pa_pstart -> the physical start block for this prealloc space
+ * pa_len    -> length for this prealloc space (in clusters)
+ * pa_free   -> free space available in this prealloc space (in clusters)
+ *
+ * The inode preallocation space is used looking at the _logical_ start
+ * block. Only if the logical file block falls within the range of a prealloc
+ * space do we consume that particular prealloc space. This makes sure that
+ * we have contiguous physical blocks representing the file blocks.
+ *
+ * The important thing to note about inode prealloc space is that we don't
+ * modify the values associated with the inode prealloc space except
+ * pa_free.
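+ *
+ * [Editor's illustration, not part of the upstream comment: with 4KiB
+ * blocks and the default s_mb_stream_request of 16, a file that would be
+ * 12 blocks (48KiB) after allocation falls below the threshold and so
+ * uses the locality-group preallocation, while a 32-block (128KiB) file
+ * uses inode preallocation.]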
+ *
+ * If we are not able to find blocks in the inode prealloc space and if we
+ * have the group allocation flag set then we look at the locality group
+ * prealloc space. These are per-CPU prealloc lists, represented as
+ *
+ * ext4_sb_info.s_locality_groups[smp_processor_id()]
+ *
+ * The reason for having a per-CPU locality group is to reduce the contention
+ * between CPUs. It is possible to get scheduled at this point.
+ *
+ * The locality group prealloc space is used looking at whether we have
+ * enough free space (pa_free) within the prealloc space.
+ *
+ * If we can't allocate blocks via inode prealloc and/or locality group
+ * prealloc then we look at the buddy cache. The buddy cache is represented
+ * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
+ * mapped to the buddy and bitmap information regarding different
+ * groups. The buddy information is attached to the buddy cache inode so
+ * that we can access it through the page cache. The information for each
+ * group is loaded via ext4_mb_load_buddy(). It comprises the block bitmap
+ * and the buddy information, stored in the inode as:
+ *
+ * {                        page                        }
+ * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
+ *
+ * one block each for bitmap and buddy information. So for each group we
+ * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE /
+ * blocksize) blocks, so it can hold information for groups_per_page
+ * groups, which is blocks_per_page/2.
+ *
+ * The buddy cache inode is not stored on disk. The inode is thrown
+ * away when the filesystem is unmounted.
+ *
+ * We look for count number of blocks in the buddy cache. If we were able
+ * to locate that many free blocks we return with additional information
+ * regarding the rest of the contiguous physical blocks available.
+ *
+ * Before allocating blocks via the buddy cache we normalize the request
+ * blocks. This ensures we ask for more blocks than we need. The extra
+ * blocks that we get after allocation are added to the respective prealloc
+ * list. In case of inode preallocation we follow a list of heuristics
+ * based on file size. This can be found in ext4_mb_normalize_request. If
+ * we are doing a group prealloc we try to normalize the request to
+ * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is
+ * dependent on the cluster size; for non-bigalloc file systems, it is
+ * 512 blocks. This can be tuned via
+ * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
+ * terms of number of blocks. If we have mounted the file system with -O
+ * stripe=<value> option the group prealloc request is normalized to the
+ * smallest multiple of the stripe value (sbi->s_stripe) which is
+ * greater than the default mb_group_prealloc.
+ *
+ * If the "mb_optimize_scan" mount option is set, we maintain in-memory
+ * group info structures in two data structures:
+ *
+ * 1) Array of largest free order xarrays (sbi->s_mb_largest_free_orders)
+ *
+ *    Locking: Writers use xa_lock, readers use rcu_read_lock.
+ *
+ *    This is an array of xarrays where the index in the array represents the
+ *    largest free order in the buddy bitmap of the participating group infos
+ *    of that xarray. So, there are exactly MB_NUM_ORDERS(sb) (which means the
+ *    total number of buddy bitmap orders possible) xarrays. Group-infos are
+ *    placed in the appropriate xarrays.
+ *
+ * 2) Average fragment size xarrays (sbi->s_mb_avg_fragment_size)
+ *
+ *    Locking: Writers use xa_lock, readers use rcu_read_lock.
+ *
+ *    This is an array of xarrays where in the i-th xarray there are groups
+ *    with average fragment size >= 2^i and < 2^(i+1). The average fragment
+ *    size is computed as ext4_group_info->bb_free /
+ *    ext4_group_info->bb_fragments. Note that we don't bother with a special
+ *    xarray for completely empty groups, so we only have MB_NUM_ORDERS(sb)
+ *    xarrays. Group-infos are placed in the appropriate xarrays.
+ *
+ * In each xarray, the index is the block group number and the value is the
+ * block group information; a non-empty value indicates the block group is
+ * present in that xarray.
+ *
+ * When the "mb_optimize_scan" mount option is set, mballoc consults the above
+ * data structures to decide the order in which groups are to be traversed for
+ * fulfilling an allocation request.
+ *
+ * At CR_POWER2_ALIGNED, we look for groups whose largest_free_order is
+ * >= the order of the request. We directly look at the largest free order
+ * list in the data structure (1) above where largest_free_order = order of
+ * the request. If that list is empty, we look at the remaining lists in
+ * increasing order of largest_free_order. This allows us to perform the
+ * CR_POWER2_ALIGNED lookup in O(1) time.
+ *
+ * At CR_GOAL_LEN_FAST, we only consider groups where the average fragment
+ * size > request size. So, we look up a group which has an average fragment
+ * size just above or equal to the request size using our average fragment
+ * size group lists (data structure 2) in O(1) time.
+ *
+ * At CR_BEST_AVAIL_LEN, we aim to optimize allocations which can't be
+ * satisfied in CR_GOAL_LEN_FAST. The fact that we couldn't find a group in
+ * CR_GOAL_LEN_FAST suggests that there is no BG that has an average
+ * fragment size > goal length. So before falling back to the slower
+ * CR_GOAL_LEN_SLOW, in CR_BEST_AVAIL_LEN we proactively trim the goal length
+ * and then use the same fragment lists as CR_GOAL_LEN_FAST to find a BG with
+ * a big enough average fragment size. This increases the chances of finding a
+ * suitable block group in O(1) time and results in faster allocation at the
+ * cost of reduced size of allocation.
+ *
+ * If the "mb_optimize_scan" mount option is not set, mballoc traverses groups
+ * in linear order, which requires O(N) search time for each CR_POWER2_ALIGNED
+ * and CR_GOAL_LEN_FAST phase.
+ *
+ * The regular allocator (using the buddy cache) supports a few tunables.
+ *
+ * /sys/fs/ext4/<partition>/mb_min_to_scan
+ * /sys/fs/ext4/<partition>/mb_max_to_scan
+ * /sys/fs/ext4/<partition>/mb_order2_req
+ * /sys/fs/ext4/<partition>/mb_max_linear_groups
+ *
+ * The regular allocator uses a buddy scan only if the request length is a
+ * power of 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs.
+ * The value of s_mb_order2_reqs can be tuned via
+ * /sys/fs/ext4/<partition>/mb_order2_req. If the request length is equal to
+ * the stripe size (sbi->s_stripe), we try to search for contiguous blocks in
+ * stripe-size units. This should result in better allocation on RAID setups.
+ * If not, we search in the specific group using the bitmap for best extents.
+ * The tunables min_to_scan and max_to_scan control the behaviour here.
+ * min_to_scan indicates how long mballoc __must__ look for a best extent and
+ * max_to_scan indicates how long mballoc __can__ look for a best extent in
+ * the found extents. Searching for blocks starts with the group specified as
+ * the goal value in the allocation context via ac_g_ex. Each group is first
+ * checked against the criteria for whether it can be used for allocation.
+ * ext4_mb_good_group() explains how the groups are checked.
+ *
+ * When "mb_optimize_scan" is turned on, as mentioned above, the groups may
+ * not get traversed linearly. That may result in subsequent allocations not
+ * being close to each other. And so, the underlying device may get filled up
+ * in a non-linear fashion. While that may not matter on non-rotational
+ * devices, for rotational devices this may result in higher seek times.
+ * "mb_max_linear_groups" tells mballoc how many groups it should search
+ * linearly before consulting the above data structures for more efficient
+ * lookups. For non-rotational devices, this value defaults to 0 and for
+ * rotational devices it is set to MB_DEFAULT_LINEAR_LIMIT.
+ *
+ * Both prealloc spaces are populated as described above. So for the first
+ * request we will hit the buddy cache, which results in this prealloc space
+ * getting filled. The prealloc space is then used for subsequent requests.
+ */
+
+/*
+ * mballoc operates on the following data:
+ *  - on-disk bitmap
+ *  - in-core buddy (actually includes buddy and bitmap)
+ *  - preallocation descriptors (PAs)
+ *
+ * there are two types of preallocations:
+ *  - inode
+ *    assigned to a specific inode and can be used for this inode only.
+ *    it describes part of the inode's space preallocated to specific
+ *    physical blocks. any block from that preallocation can be used
+ *    independently. the descriptor just tracks the number of blocks left
+ *    unused. so, before taking some block from the descriptor, one must
+ *    make sure the corresponding logical block isn't allocated yet. this
+ *    also means that freeing any block within the descriptor's range
+ *    must discard all preallocated blocks.
+ *  - locality group
+ *    assigned to a specific locality group, which does not translate to a
+ *    permanent set of inodes: an inode can join and leave a group. space
+ *    from this type of preallocation can be used for any inode. thus
+ *    it's consumed from the beginning to the end.
+ *
+ * the relation between them can be expressed as:
+ * in-core buddy = on-disk bitmap + preallocation descriptors
+ *
+ * this means the blocks mballoc considers used are:
+ *  - allocated blocks (persistent)
+ *  - preallocated blocks (non-persistent)
+ *
+ * consistency in the mballoc world means that at any time a block is either
+ * free or used in ALL structures. notice: "any time" should not be read
+ * literally -- time is discrete and delimited by locks.
+ *
+ * to keep it simple, we don't use block numbers, instead we count the number
+ * of blocks: how many blocks are marked used/free in the on-disk bitmap,
+ * buddy and PA.
+ *
+ * all operations can be expressed as:
+ *  - init buddy:			buddy = on-disk + PAs
+ *  - new PA:				buddy += N; PA = N
+ *  - use inode PA:			on-disk += N; PA -= N
+ *  - discard inode PA:			buddy -= on-disk - PA; PA = 0
+ *  - use locality group PA:		on-disk += N; PA -= N
+ *  - discard locality group PA:	buddy -= PA; PA = 0
+ *  note: 'buddy -= on-disk - PA' is used to show that the on-disk bitmap
+ *        is used in the real operation because we can't know the actual
+ *        used bits from the PA, only from the on-disk bitmap
+ *
+ * if we follow this strict logic, then all operations above should be
+ * atomic. given that some of them can block, we'd have to use something
+ * like semaphores, killing performance on high-end SMP hardware.
let's try to relax it using + * the following knowledge: + * 1) if buddy is referenced, it's already initialized + * 2) while block is used in buddy and the buddy is referenced, + * nobody can re-allocate that block + * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has + * bit set and PA claims same block, it's OK. IOW, one can set bit in + * on-disk bitmap if buddy has same bit set or/and PA covers corresponded + * block + * + * so, now we're building a concurrency table: + * - init buddy vs. + * - new PA + * blocks for PA are allocated in the buddy, buddy must be referenced + * until PA is linked to allocation group to avoid concurrent buddy init + * - use inode PA + * we need to make sure that either on-disk bitmap or PA has uptodate data + * given (3) we care that PA-=N operation doesn't interfere with init + * - discard inode PA + * the simplest way would be to have buddy initialized by the discard + * - use locality group PA + * again PA-=N must be serialized with init + * - discard locality group PA + * the simplest way would be to have buddy initialized by the discard + * - new PA vs. + * - use inode PA + * i_data_sem serializes them + * - discard inode PA + * discard process must wait until PA isn't used by another process + * - use locality group PA + * some mutex should serialize them + * - discard locality group PA + * discard process must wait until PA isn't used by another process + * - use inode PA + * - use inode PA + * i_data_sem or another mutex should serializes them + * - discard inode PA + * discard process must wait until PA isn't used by another process + * - use locality group PA + * nothing wrong here -- they're different PAs covering different blocks + * - discard locality group PA + * discard process must wait until PA isn't used by another process + * + * now we're ready to make few consequences: + * - PA is referenced and while it is no discard is possible + * - PA is referenced until block isn't marked in on-disk bitmap + * - PA changes only after on-disk bitmap + * - discard must not compete with init. either init is done before + * any discard or they're serialized somehow + * - buddy init as sum of on-disk bitmap and PAs is done atomically + * + * a special case when we've used PA to emptiness. 
no need to modify buddy + * in this case, but we should care about concurrent init + * + */ + + /* + * Logic in few words: + * + * - allocation: + * load group + * find blocks + * mark bits in on-disk bitmap + * release group + * + * - use preallocation: + * find proper PA (per-inode or group) + * load group + * mark bits in on-disk bitmap + * release group + * release PA + * + * - free: + * load group + * mark bits in on-disk bitmap + * release group + * + * - discard preallocations in group: + * mark PAs deleted + * move them onto local list + * load on-disk bitmap + * load group + * remove PA from object (inode or locality group) + * mark free blocks in-core + * + * - discard inode's preallocations: + */ + +/* + * Locking rules + * + * Locks: + * - bitlock on a group (group) + * - object (inode/locality) (object) + * - per-pa lock (pa) + * - cr_power2_aligned lists lock (cr_power2_aligned) + * - cr_goal_len_fast lists lock (cr_goal_len_fast) + * + * Paths: + * - new pa + * object + * group + * + * - find and use pa: + * pa + * + * - release consumed pa: + * pa + * group + * object + * + * - generate in-core bitmap: + * group + * pa + * + * - discard all for given object (inode, locality group): + * object + * pa + * group + * + * - discard all for given group: + * group + * pa + * group + * object + * + * - allocation path (ext4_mb_regular_allocator) + * group + * cr_power2_aligned/cr_goal_len_fast + */ +static struct kmem_cache *ext4_pspace_cachep; +static struct kmem_cache *ext4_ac_cachep; +static struct kmem_cache *ext4_free_data_cachep; + +/* We create slab caches for groupinfo data structures based on the + * superblock block size. There will be one per mounted filesystem for + * each unique s_blocksize_bits */ +#define NR_GRPINFO_CACHES 8 +static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; + +static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { + "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", + "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", + "ext4_groupinfo_64k", "ext4_groupinfo_128k" +}; + +static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group); +static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); + +static int ext4_mb_scan_group(struct ext4_allocation_context *ac, + ext4_group_t group); + +static int ext4_try_to_trim_range(struct super_block *sb, + struct ext4_buddy *e4b, ext4_grpblk_t start, + ext4_grpblk_t max, ext4_grpblk_t minblocks); + +/* + * The algorithm using this percpu seq counter goes below: + * 1. We sample the percpu discard_pa_seq counter before trying for block + * allocation in ext4_mb_new_blocks(). + * 2. We increment this percpu discard_pa_seq counter when we either allocate + * or free these blocks i.e. while marking those blocks as used/free in + * mb_mark_used()/mb_free_blocks(). + * 3. We also increment this percpu seq counter when we successfully identify + * that the bb_prealloc_list is not empty and hence proceed for discarding + * of those PAs inside ext4_mb_discard_group_preallocations(). + * + * Now to make sure that the regular fast path of block allocation is not + * affected, as a small optimization we only sample the percpu seq counter + * on that cpu. Only when the block allocation fails and when freed blocks + * found were 0, that is when we sample percpu seq counter for all cpus using + * below function ext4_get_discard_pa_seq_sum(). 
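[A rough userspace model of the per-cpu sequence counter described above:
the fast path reads only the local counter, while the slow path sums every
cpu's counter, mirroring ext4_get_discard_pa_seq_sum() just below.
NR_CPUS_MODEL and the array are hypothetical stand-ins for the kernel's
per-cpu machinery.]

#include <stdio.h>
#include <stdint.h>

#define NR_CPUS_MODEL 4

static uint64_t discard_pa_seq_model[NR_CPUS_MODEL];

/* sum across all cpus, as the slow path does */
static uint64_t seq_sum(void)
{
	uint64_t sum = 0;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS_MODEL; cpu++)
		sum += discard_pa_seq_model[cpu];
	return sum;
}

int main(void)
{
	uint64_t before;

	before = discard_pa_seq_model[0];	/* fast path: this cpu only */
	discard_pa_seq_model[2]++;		/* another cpu frees blocks */

	/* allocation failed and no freed blocks were seen: sum all cpus */
	if (seq_sum() != before)
		printf("seq changed (%llu -> %llu): retry the allocation\n",
		       (unsigned long long)before,
		       (unsigned long long)seq_sum());
	return 0;
}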
This happens after making + * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty. + */ +static DEFINE_PER_CPU(u64, discard_pa_seq); +static inline u64 ext4_get_discard_pa_seq_sum(void) +{ + int __cpu; + u64 __seq = 0; + + for_each_possible_cpu(__cpu) + __seq += per_cpu(discard_pa_seq, __cpu); + return __seq; +} + +static inline void *mb_correct_addr_and_bit(int *bit, void *addr) +{ +#if BITS_PER_LONG == 64 + *bit += ((unsigned long) addr & 7UL) << 3; + addr = (void *) ((unsigned long) addr & ~7UL); +#elif BITS_PER_LONG == 32 + *bit += ((unsigned long) addr & 3UL) << 3; + addr = (void *) ((unsigned long) addr & ~3UL); +#else +#error "how many bits you are?!" +#endif + return addr; +} + +static inline int mb_test_bit(int bit, void *addr) +{ + /* + * ext4_test_bit on architecture like powerpc + * needs unsigned long aligned address + */ + addr = mb_correct_addr_and_bit(&bit, addr); + return ext4_test_bit(bit, addr); +} + +static inline void mb_set_bit(int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + ext4_set_bit(bit, addr); +} + +static inline void mb_clear_bit(int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + ext4_clear_bit(bit, addr); +} + +static inline int mb_test_and_clear_bit(int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + return ext4_test_and_clear_bit(bit, addr); +} + +static inline int mb_find_next_zero_bit(void *addr, int max, int start) +{ + int fix = 0, ret, tmpmax; + addr = mb_correct_addr_and_bit(&fix, addr); + tmpmax = max + fix; + start += fix; + + ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; +} + +static inline int mb_find_next_bit(void *addr, int max, int start) +{ + int fix = 0, ret, tmpmax; + addr = mb_correct_addr_and_bit(&fix, addr); + tmpmax = max + fix; + start += fix; + + ret = ext4_find_next_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; +} + +static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) +{ + char *bb; + + BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); + BUG_ON(max == NULL); + + if (order > e4b->bd_blkbits + 1) { + *max = 0; + return NULL; + } + + /* at order 0 we see each particular block */ + if (order == 0) { + *max = 1 << (e4b->bd_blkbits + 3); + return e4b->bd_bitmap; + } + + bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; + *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; + + return bb; +} + +#ifdef DOUBLE_CHECK +static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, + int first, int count) +{ + int i; + struct super_block *sb = e4b->bd_sb; + + if (unlikely(e4b->bd_info->bb_bitmap == NULL)) + return; + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); + for (i = 0; i < count; i++) { + if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { + ext4_fsblk_t blocknr; + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); + blocknr += EXT4_C2B(EXT4_SB(sb), first + i); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? 
inode->i_ino : 0, + blocknr, + "freeing block already freed " + "(bit %u)", + first + i); + } + mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); + } +} + +static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) +{ + int i; + + if (unlikely(e4b->bd_info->bb_bitmap == NULL)) + return; + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); + for (i = 0; i < count; i++) { + BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); + mb_set_bit(first + i, e4b->bd_info->bb_bitmap); + } +} + +static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) +{ + if (unlikely(e4b->bd_info->bb_bitmap == NULL)) + return; + if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { + unsigned char *b1, *b2; + int i; + b1 = (unsigned char *) e4b->bd_info->bb_bitmap; + b2 = (unsigned char *) bitmap; + for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { + if (b1[i] != b2[i]) { + ext4_msg(e4b->bd_sb, KERN_ERR, + "corruption in group %u " + "at byte %u(%u): %x in copy != %x " + "on disk/prealloc", + e4b->bd_group, i, i * 8, b1[i], b2[i]); + BUG(); + } + } + } +} + +static void mb_group_bb_bitmap_alloc(struct super_block *sb, + struct ext4_group_info *grp, ext4_group_t group) +{ + struct buffer_head *bh; + + grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); + if (!grp->bb_bitmap) + return; + + bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR_OR_NULL(bh)) { + kfree(grp->bb_bitmap); + grp->bb_bitmap = NULL; + return; + } + + memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize); + put_bh(bh); +} + +static void mb_group_bb_bitmap_free(struct ext4_group_info *grp) +{ + kfree(grp->bb_bitmap); +} + +#else +static inline void mb_free_blocks_double(struct inode *inode, + struct ext4_buddy *e4b, int first, int count) +{ + return; +} +static inline void mb_mark_used_double(struct ext4_buddy *e4b, + int first, int count) +{ + return; +} +static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) +{ + return; +} + +static inline void mb_group_bb_bitmap_alloc(struct super_block *sb, + struct ext4_group_info *grp, ext4_group_t group) +{ + return; +} + +static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp) +{ + return; +} +#endif + +#ifdef AGGRESSIVE_CHECK + +#define MB_CHECK_ASSERT(assert) \ +do { \ + if (!(assert)) { \ + printk(KERN_EMERG \ + "Assertion failure in %s() at %s:%d: \"%s\"\n", \ + function, file, line, # assert); \ + BUG(); \ + } \ +} while (0) + +static void __mb_check_buddy(struct ext4_buddy *e4b, char *file, + const char *function, int line) +{ + struct super_block *sb = e4b->bd_sb; + int order = e4b->bd_blkbits + 1; + int max; + int max2; + int i; + int j; + int k; + int count; + struct ext4_group_info *grp; + int fragments = 0; + int fstart; + struct list_head *cur; + void *buddy; + void *buddy2; + + if (e4b->bd_info->bb_check_counter++ % 10) + return; + + while (order > 1) { + buddy = mb_find_buddy(e4b, order, &max); + MB_CHECK_ASSERT(buddy); + buddy2 = mb_find_buddy(e4b, order - 1, &max2); + MB_CHECK_ASSERT(buddy2); + MB_CHECK_ASSERT(buddy != buddy2); + MB_CHECK_ASSERT(max * 2 == max2); + + count = 0; + for (i = 0; i < max; i++) { + + if (mb_test_bit(i, buddy)) { + /* only single bit in buddy2 may be 0 */ + if (!mb_test_bit(i << 1, buddy2)) { + MB_CHECK_ASSERT( + mb_test_bit((i<<1)+1, buddy2)); + } + continue; + } + + /* both bits in buddy2 must be 1 */ + MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); + MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); + + for (j = 0; j < (1 << order); j++) { + k = (i * (1 << order)) + 
j; + MB_CHECK_ASSERT( + !mb_test_bit(k, e4b->bd_bitmap)); + } + count++; + } + MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); + order--; + } + + fstart = -1; + buddy = mb_find_buddy(e4b, 0, &max); + for (i = 0; i < max; i++) { + if (!mb_test_bit(i, buddy)) { + MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); + if (fstart == -1) { + fragments++; + fstart = i; + } + continue; + } + fstart = -1; + /* check used bits only */ + for (j = 0; j < e4b->bd_blkbits + 1; j++) { + buddy2 = mb_find_buddy(e4b, j, &max2); + k = i >> j; + MB_CHECK_ASSERT(k < max2); + MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); + } + } + MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); + MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); + + grp = ext4_get_group_info(sb, e4b->bd_group); + if (!grp) + return; + list_for_each(cur, &grp->bb_prealloc_list) { + ext4_group_t groupnr; + struct ext4_prealloc_space *pa; + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); + MB_CHECK_ASSERT(groupnr == e4b->bd_group); + for (i = 0; i < pa->pa_len; i++) + MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); + } +} +#undef MB_CHECK_ASSERT +#define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ + __FILE__, __func__, __LINE__) +#else +#define mb_check_buddy(e4b) +#endif + +/* + * Divide blocks started from @first with length @len into + * smaller chunks with power of 2 blocks. + * Clear the bits in bitmap which the blocks of the chunk(s) covered, + * then increase bb_counters[] for corresponded chunk size. + */ +static void ext4_mb_mark_free_simple(struct super_block *sb, + void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, + struct ext4_group_info *grp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_grpblk_t min; + ext4_grpblk_t max; + ext4_grpblk_t chunk; + unsigned int border; + + BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); + + border = 2 << sb->s_blocksize_bits; + + while (len > 0) { + /* find how many blocks can be covered since this position */ + max = ffs(first | border) - 1; + + /* find how many blocks of power 2 we need to mark */ + min = fls(len) - 1; + + if (max < min) + min = max; + chunk = 1 << min; + + /* mark multiblock chunks only */ + grp->bb_counters[min]++; + if (min > 0) + mb_clear_bit(first >> min, + buddy + sbi->s_mb_offsets[min]); + + len -= chunk; + first += chunk; + } +} + +static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) +{ + int order; + + /* + * We don't bother with a special lists groups with only 1 block free + * extents and for completely empty groups. + */ + order = fls(len) - 2; + if (order < 0) + return 0; + if (order == MB_NUM_ORDERS(sb)) + order--; + if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb))) + order = MB_NUM_ORDERS(sb) - 1; + return order; +} + +/* Move group to appropriate avg_fragment_size list */ +static void +mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int new, old; + + if (!test_opt2(sb, MB_OPTIMIZE_SCAN)) + return; + + old = grp->bb_avg_fragment_size_order; + new = grp->bb_fragments == 0 ? -1 : + mb_avg_fragment_size_order(sb, grp->bb_free / grp->bb_fragments); + if (new == old) + return; + + if (old >= 0) + xa_erase(&sbi->s_mb_avg_fragment_size[old], grp->bb_group); + + grp->bb_avg_fragment_size_order = new; + if (new >= 0) { + /* + * Cannot use __GFP_NOFAIL because we hold the group lock. 
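[To make the bucketing in mb_avg_fragment_size_order() above concrete, here
is a standalone model. fls() is the kernel's "find last set" helper and is
open-coded here; NUM_ORDERS_MODEL assumes a 4k block size. The WARN_ON_ONCE
clamp is modelled as a plain if.]

#include <stdio.h>

/* open-coded fls(): position of the highest set bit, 1-based; 0 for 0 */
static int fls_model(unsigned int x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

#define NUM_ORDERS_MODEL 14	/* MB_NUM_ORDERS(sb) for 4k blocks */

static int avg_fragment_size_order_model(int len)
{
	int order = fls_model(len) - 2;

	if (order < 0)
		return 0;
	if (order == NUM_ORDERS_MODEL)
		order--;
	if (order > NUM_ORDERS_MODEL)
		order = NUM_ORDERS_MODEL - 1;
	return order;
}

int main(void)
{
	int len;

	for (len = 1; len <= 64; len *= 2)
		printf("avg fragment %2d clusters -> bucket %d\n",
		       len, avg_fragment_size_order_model(len));
	return 0;
}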
+ * Although allocation for insertion may fails, it's not fatal + * as we have linear traversal to fall back on. + */ + int err = xa_insert(&sbi->s_mb_avg_fragment_size[new], + grp->bb_group, grp, GFP_ATOMIC); + if (err) + mb_debug(sb, "insert group: %u to s_mb_avg_fragment_size[%d] failed, err %d", + grp->bb_group, new, err); + } +} + +static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac, + struct xarray *xa, + ext4_group_t start, ext4_group_t end) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + enum criteria cr = ac->ac_criteria; + ext4_group_t ngroups = ext4_get_groups_count(sb); + unsigned long group = start; + struct ext4_group_info *grp; + + if (WARN_ON_ONCE(end > ngroups || start >= end)) + return 0; + + xa_for_each_range(xa, group, grp, start, end - 1) { + int err; + + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]); + + err = ext4_mb_scan_group(ac, grp->bb_group); + if (err || ac->ac_status != AC_STATUS_CONTINUE) + return err; + + cond_resched(); + } + + return 0; +} + +/* + * Find a suitable group of given order from the largest free orders xarray. + */ +static inline int +ext4_mb_scan_groups_largest_free_order_range(struct ext4_allocation_context *ac, + int order, ext4_group_t start, + ext4_group_t end) +{ + struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_largest_free_orders[order]; + + if (xa_empty(xa)) + return 0; + + return ext4_mb_scan_groups_xa_range(ac, xa, start, end); +} + +/* + * Choose next group by traversing largest_free_order lists. Updates *new_cr if + * cr level needs an update. + */ +static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac, + ext4_group_t group) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int i; + int ret = 0; + ext4_group_t start, end; + + start = group; + end = ext4_get_groups_count(ac->ac_sb); +wrap_around: + for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { + ret = ext4_mb_scan_groups_largest_free_order_range(ac, i, + start, end); + if (ret || ac->ac_status != AC_STATUS_CONTINUE) + return ret; + } + if (start) { + end = start; + start = 0; + goto wrap_around; + } + + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); + + /* Increment cr and search again if no group is found */ + ac->ac_criteria = CR_GOAL_LEN_FAST; + return ret; +} + +/* + * Find a suitable group of given order from the average fragments xarray. + */ +static int +ext4_mb_scan_groups_avg_frag_order_range(struct ext4_allocation_context *ac, + int order, ext4_group_t start, + ext4_group_t end) +{ + struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_avg_fragment_size[order]; + + if (xa_empty(xa)) + return 0; + + return ext4_mb_scan_groups_xa_range(ac, xa, start, end); +} + +/* + * Choose next group by traversing average fragment size list of suitable + * order. Updates *new_cr if cr level needs an update. 
+ */ +static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac, + ext4_group_t group) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int i, ret = 0; + ext4_group_t start, end; + + start = group; + end = ext4_get_groups_count(ac->ac_sb); +wrap_around: + i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); + for (; i < MB_NUM_ORDERS(ac->ac_sb); i++) { + ret = ext4_mb_scan_groups_avg_frag_order_range(ac, i, + start, end); + if (ret || ac->ac_status != AC_STATUS_CONTINUE) + return ret; + } + if (start) { + end = start; + start = 0; + goto wrap_around; + } + + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); + /* + * CR_BEST_AVAIL_LEN works based on the concept that we have + * a larger normalized goal len request which can be trimmed to + * a smaller goal len such that it can still satisfy original + * request len. However, allocation request for non-regular + * files never gets normalized. + * See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA). + */ + if (ac->ac_flags & EXT4_MB_HINT_DATA) + ac->ac_criteria = CR_BEST_AVAIL_LEN; + else + ac->ac_criteria = CR_GOAL_LEN_SLOW; + + return ret; +} + +/* + * We couldn't find a group in CR_GOAL_LEN_FAST so try to find the highest free fragment + * order we have and proactively trim the goal request length to that order to + * find a suitable group faster. + * + * This optimizes allocation speed at the cost of slightly reduced + * preallocations. However, we make sure that we don't trim the request too + * much and fall to CR_GOAL_LEN_SLOW in that case. + */ +static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac, + ext4_group_t group) +{ + int ret = 0; + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int i, order, min_order; + unsigned long num_stripe_clusters = 0; + ext4_group_t start, end; + + /* + * mb_avg_fragment_size_order() returns order in a way that makes + * retrieving back the length using (1 << order) inaccurate. Hence, use + * fls() instead since we need to know the actual length while modifying + * goal length. + */ + order = fls(ac->ac_g_ex.fe_len) - 1; + if (WARN_ON_ONCE(order - 1 > MB_NUM_ORDERS(ac->ac_sb))) + order = MB_NUM_ORDERS(ac->ac_sb); + min_order = order - sbi->s_mb_best_avail_max_trim_order; + if (min_order < 0) + min_order = 0; + + if (sbi->s_stripe > 0) { + /* + * We are assuming that stripe size is always a multiple of + * cluster ratio otherwise __ext4_fill_super exists early. + */ + num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe); + if (1 << min_order < num_stripe_clusters) + /* + * We consider 1 order less because later we round + * up the goal len to num_stripe_clusters + */ + min_order = fls(num_stripe_clusters) - 1; + } + + if (1 << min_order < ac->ac_o_ex.fe_len) + min_order = fls(ac->ac_o_ex.fe_len); + + start = group; + end = ext4_get_groups_count(ac->ac_sb); +wrap_around: + for (i = order; i >= min_order; i--) { + int frag_order; + /* + * Scale down goal len to make sure we find something + * in the free fragments list. Basically, reduce + * preallocations. + */ + ac->ac_g_ex.fe_len = 1 << i; + + if (num_stripe_clusters > 0) { + /* + * Try to round up the adjusted goal length to + * stripe size (in cluster units) multiple for + * efficiency. 
+ */ + ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len, + num_stripe_clusters); + } + + frag_order = mb_avg_fragment_size_order(ac->ac_sb, + ac->ac_g_ex.fe_len); + + ret = ext4_mb_scan_groups_avg_frag_order_range(ac, frag_order, + start, end); + if (ret || ac->ac_status != AC_STATUS_CONTINUE) + return ret; + } + if (start) { + end = start; + start = 0; + goto wrap_around; + } + + /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ + ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); + ac->ac_criteria = CR_GOAL_LEN_SLOW; + + return ret; +} + +static inline int should_optimize_scan(struct ext4_allocation_context *ac) +{ + if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) + return 0; + if (ac->ac_criteria >= CR_GOAL_LEN_SLOW) + return 0; + if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) + return 0; + return 1; +} + +/* + * next linear group for allocation. + */ +static void next_linear_group(ext4_group_t *group, ext4_group_t ngroups) +{ + /* + * Artificially restricted ngroups for non-extent + * files makes group > ngroups possible on first loop. + */ + *group = *group + 1 >= ngroups ? 0 : *group + 1; +} + +static int ext4_mb_scan_groups_linear(struct ext4_allocation_context *ac, + ext4_group_t ngroups, ext4_group_t *start, ext4_group_t count) +{ + int ret, i; + enum criteria cr = ac->ac_criteria; + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t group = *start; + + for (i = 0; i < count; i++, next_linear_group(&group, ngroups)) { + ret = ext4_mb_scan_group(ac, group); + if (ret || ac->ac_status != AC_STATUS_CONTINUE) + return ret; + cond_resched(); + } + + *start = group; + if (count == ngroups) + ac->ac_criteria++; + + /* Processed all groups and haven't found blocks */ + if (sbi->s_mb_stats && i == ngroups) + atomic64_inc(&sbi->s_bal_cX_failed[cr]); + + return 0; +} + +static int ext4_mb_scan_groups(struct ext4_allocation_context *ac) +{ + int ret = 0; + ext4_group_t start; + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb); + + /* non-extent files are limited to low blocks/groups */ + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) + ngroups = sbi->s_blockfile_groups; + + /* searching for the right group start from the goal value specified */ + start = ac->ac_g_ex.fe_group; + ac->ac_prefetch_grp = start; + ac->ac_prefetch_nr = 0; + + if (!should_optimize_scan(ac)) + return ext4_mb_scan_groups_linear(ac, ngroups, &start, ngroups); + + /* + * Optimized scanning can return non adjacent groups which can cause + * seek overhead for rotational disks. So try few linear groups before + * trying optimized scan. + */ + if (sbi->s_mb_max_linear_groups) + ret = ext4_mb_scan_groups_linear(ac, ngroups, &start, + sbi->s_mb_max_linear_groups); + if (ret || ac->ac_status != AC_STATUS_CONTINUE) + return ret; + + switch (ac->ac_criteria) { + case CR_POWER2_ALIGNED: + return ext4_mb_scan_groups_p2_aligned(ac, start); + case CR_GOAL_LEN_FAST: + return ext4_mb_scan_groups_goal_fast(ac, start); + case CR_BEST_AVAIL_LEN: + return ext4_mb_scan_groups_best_avail(ac, start); + default: + /* + * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an + * rb tree sorted by bb_free. But until that happens, we should + * never come here. + */ + WARN_ON(1); + } + + return 0; +} + +/* + * Cache the order of the largest free extent we have available in this block + * group. 
+ */ +static void +mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int new, old = grp->bb_largest_free_order; + + for (new = MB_NUM_ORDERS(sb) - 1; new >= 0; new--) + if (grp->bb_counters[new] > 0) + break; + + /* No need to move between order lists? */ + if (new == old) + return; + + if (old >= 0) { + struct xarray *xa = &sbi->s_mb_largest_free_orders[old]; + + if (!xa_empty(xa) && xa_load(xa, grp->bb_group)) + xa_erase(xa, grp->bb_group); + } + + grp->bb_largest_free_order = new; + if (test_opt2(sb, MB_OPTIMIZE_SCAN) && new >= 0 && grp->bb_free) { + /* + * Cannot use __GFP_NOFAIL because we hold the group lock. + * Although allocation for insertion may fails, it's not fatal + * as we have linear traversal to fall back on. + */ + int err = xa_insert(&sbi->s_mb_largest_free_orders[new], + grp->bb_group, grp, GFP_ATOMIC); + if (err) + mb_debug(sb, "insert group: %u to s_mb_largest_free_orders[%d] failed, err %d", + grp->bb_group, new, err); + } +} + +static noinline_for_stack +void ext4_mb_generate_buddy(struct super_block *sb, + void *buddy, void *bitmap, ext4_group_t group, + struct ext4_group_info *grp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_grpblk_t i = 0; + ext4_grpblk_t first; + ext4_grpblk_t len; + unsigned free = 0; + unsigned fragments = 0; + unsigned long long period = get_cycles(); + + /* initialize buddy from bitmap which is aggregation + * of on-disk bitmap and preallocations */ + i = mb_find_next_zero_bit(bitmap, max, 0); + grp->bb_first_free = i; + while (i < max) { + fragments++; + first = i; + i = mb_find_next_bit(bitmap, max, i); + len = i - first; + free += len; + if (len > 1) + ext4_mb_mark_free_simple(sb, buddy, first, len, grp); + else + grp->bb_counters[0]++; + if (i < max) + i = mb_find_next_zero_bit(bitmap, max, i); + } + grp->bb_fragments = fragments; + + if (free != grp->bb_free) { + ext4_grp_locked_error(sb, group, 0, 0, + "block bitmap and bg descriptor " + "inconsistent: %u vs %u free clusters", + free, grp->bb_free); + /* + * If we intend to continue, we consider group descriptor + * corrupt and update bb_free using bitmap value + */ + grp->bb_free = free; + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + } + mb_set_largest_free_order(sb, grp); + mb_update_avg_fragment_size(sb, grp); + + clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); + + period = get_cycles() - period; + atomic_inc(&sbi->s_mb_buddies_generated); + atomic64_add(period, &sbi->s_mb_generation_time); +} + +static void mb_regenerate_buddy(struct ext4_buddy *e4b) +{ + int count; + int order = 1; + void *buddy; + + while ((buddy = mb_find_buddy(e4b, order++, &count))) + mb_set_bits(buddy, 0, count); + + e4b->bd_info->bb_fragments = 0; + memset(e4b->bd_info->bb_counters, 0, + sizeof(*e4b->bd_info->bb_counters) * + (e4b->bd_sb->s_blocksize_bits + 2)); + + ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, + e4b->bd_bitmap, e4b->bd_group, e4b->bd_info); +} + +/* The buddy information is attached the buddy cache inode + * for convenience. The information regarding each group + * is loaded via ext4_mb_load_buddy. The information involve + * block bitmap and buddy information. The information are + * stored in the inode as + * + * { page } + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... + * + * + * one block each for bitmap and buddy information. + * So for each group we take up 2 blocks. 
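[The two-blocks-per-group layout sketched in this comment maps a group
number to a page index and an offset with simple arithmetic. A tiny
standalone illustration; the values assume 4k blocks and 4k pages, so one
block per page.]

#include <stdio.h>

int main(void)
{
	unsigned int blocksize = 4096, page_size = 4096;
	unsigned int blocks_per_page = page_size / blocksize;
	unsigned int group = 5;
	unsigned int block = group * 2;		/* bitmap block for group 5 */

	printf("bitmap: page %u offset %u\n",
	       block / blocks_per_page, block % blocks_per_page);
	printf("buddy:  page %u offset %u\n",
	       (block + 1) / blocks_per_page, (block + 1) % blocks_per_page);
	return 0;
}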
A page can + * contain blocks_per_page (PAGE_SIZE / blocksize) blocks. + * So it can have information regarding groups_per_page which + * is blocks_per_page/2 + * + * Locking note: This routine takes the block group lock of all groups + * for this page; do not hold this lock when calling this routine! + */ + +static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) +{ + ext4_group_t ngroups; + unsigned int blocksize; + int blocks_per_page; + int groups_per_page; + int err = 0; + int i; + ext4_group_t first_group, group; + int first_block; + struct super_block *sb; + struct buffer_head *bhs; + struct buffer_head **bh = NULL; + struct inode *inode; + char *data; + char *bitmap; + struct ext4_group_info *grinfo; + + inode = folio->mapping->host; + sb = inode->i_sb; + ngroups = ext4_get_groups_count(sb); + blocksize = i_blocksize(inode); + blocks_per_page = PAGE_SIZE / blocksize; + + mb_debug(sb, "init folio %lu\n", folio->index); + + groups_per_page = blocks_per_page >> 1; + if (groups_per_page == 0) + groups_per_page = 1; + + /* allocate buffer_heads to read bitmaps */ + if (groups_per_page > 1) { + i = sizeof(struct buffer_head *) * groups_per_page; + bh = kzalloc(i, gfp); + if (bh == NULL) + return -ENOMEM; + } else + bh = &bhs; + + first_group = folio->index * blocks_per_page / 2; + + /* read all groups the folio covers into the cache */ + for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + if (group >= ngroups) + break; + + grinfo = ext4_get_group_info(sb, group); + if (!grinfo) + continue; + /* + * If page is uptodate then we came here after online resize + * which added some new uninitialized group info structs, so + * we must skip all initialized uptodate buddies on the folio, + * which may be currently in use by an allocating task. 
+ */ + if (folio_test_uptodate(folio) && + !EXT4_MB_GRP_NEED_INIT(grinfo)) { + bh[i] = NULL; + continue; + } + bh[i] = ext4_read_block_bitmap_nowait(sb, group, false); + if (IS_ERR(bh[i])) { + err = PTR_ERR(bh[i]); + bh[i] = NULL; + goto out; + } + mb_debug(sb, "read bitmap for group %u\n", group); + } + + /* wait for I/O completion */ + for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + int err2; + + if (!bh[i]) + continue; + err2 = ext4_wait_block_bitmap(sb, group, bh[i]); + if (!err) + err = err2; + } + + first_block = folio->index * blocks_per_page; + for (i = 0; i < blocks_per_page; i++) { + group = (first_block + i) >> 1; + if (group >= ngroups) + break; + + if (!bh[group - first_group]) + /* skip initialized uptodate buddy */ + continue; + + if (!buffer_verified(bh[group - first_group])) + /* Skip faulty bitmaps */ + continue; + err = 0; + + /* + * data carry information regarding this + * particular group in the format specified + * above + * + */ + data = folio_address(folio) + (i * blocksize); + bitmap = bh[group - first_group]->b_data; + + /* + * We place the buddy block and bitmap block + * close together + */ + grinfo = ext4_get_group_info(sb, group); + if (!grinfo) { + err = -EFSCORRUPTED; + goto out; + } + if ((first_block + i) & 1) { + /* this is block of buddy */ + BUG_ON(incore == NULL); + mb_debug(sb, "put buddy for group %u in folio %lu/%x\n", + group, folio->index, i * blocksize); + trace_ext4_mb_buddy_bitmap_load(sb, group); + grinfo->bb_fragments = 0; + memset(grinfo->bb_counters, 0, + sizeof(*grinfo->bb_counters) * + (MB_NUM_ORDERS(sb))); + /* + * incore got set to the group block bitmap below + */ + ext4_lock_group(sb, group); + /* init the buddy */ + memset(data, 0xff, blocksize); + ext4_mb_generate_buddy(sb, data, incore, group, grinfo); + ext4_unlock_group(sb, group); + incore = NULL; + } else { + /* this is block of bitmap */ + BUG_ON(incore != NULL); + mb_debug(sb, "put bitmap for group %u in folio %lu/%x\n", + group, folio->index, i * blocksize); + trace_ext4_mb_bitmap_load(sb, group); + + /* see comments in ext4_mb_put_pa() */ + ext4_lock_group(sb, group); + memcpy(data, bitmap, blocksize); + + /* mark all preallocated blks used in in-core bitmap */ + ext4_mb_generate_from_pa(sb, data, group); + WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root)); + ext4_unlock_group(sb, group); + + /* set incore so that the buddy information can be + * generated using this + */ + incore = data; + } + } + folio_mark_uptodate(folio); + +out: + if (bh) { + for (i = 0; i < groups_per_page; i++) + brelse(bh[i]); + if (bh != &bhs) + kfree(bh); + } + return err; +} + +/* + * Lock the buddy and bitmap pages. This make sure other parallel init_group + * on the same buddy page doesn't happen whild holding the buddy page lock. + * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap + * are on the same page e4b->bd_buddy_folio is NULL and return value is 0. + */ +static int ext4_mb_get_buddy_page_lock(struct super_block *sb, + ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) +{ + struct inode *inode = EXT4_SB(sb)->s_buddy_cache; + int block, pnum, poff; + int blocks_per_page; + struct folio *folio; + + e4b->bd_buddy_folio = NULL; + e4b->bd_bitmap_folio = NULL; + + blocks_per_page = PAGE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. 
+ */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + folio = __filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (IS_ERR(folio)) + return PTR_ERR(folio); + BUG_ON(folio->mapping != inode->i_mapping); + e4b->bd_bitmap_folio = folio; + e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); + + if (blocks_per_page >= 2) { + /* buddy and bitmap are on the same page */ + return 0; + } + + /* blocks_per_page == 1, hence we need another page for the buddy */ + folio = __filemap_get_folio(inode->i_mapping, block + 1, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (IS_ERR(folio)) + return PTR_ERR(folio); + BUG_ON(folio->mapping != inode->i_mapping); + e4b->bd_buddy_folio = folio; + return 0; +} + +static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) +{ + if (e4b->bd_bitmap_folio) { + folio_unlock(e4b->bd_bitmap_folio); + folio_put(e4b->bd_bitmap_folio); + } + if (e4b->bd_buddy_folio) { + folio_unlock(e4b->bd_buddy_folio); + folio_put(e4b->bd_buddy_folio); + } +} + +/* + * Locking note: This routine calls ext4_mb_init_cache(), which takes the + * block group lock of all groups for this page; do not hold the BG lock when + * calling this routine! + */ +static noinline_for_stack +int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) +{ + + struct ext4_group_info *this_grp; + struct ext4_buddy e4b; + struct folio *folio; + int ret = 0; + + might_sleep(); + mb_debug(sb, "init group %u\n", group); + this_grp = ext4_get_group_info(sb, group); + if (!this_grp) + return -EFSCORRUPTED; + + /* + * This ensures that we don't reinit the buddy cache + * page which map to the group from which we are already + * allocating. If we are looking at the buddy cache we would + * have taken a reference using ext4_mb_load_buddy and that + * would have pinned buddy page to page cache. + * The call to ext4_mb_get_buddy_page_lock will mark the + * page accessed. + */ + ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp); + if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { + /* + * somebody initialized the group + * return without doing anything + */ + goto err; + } + + folio = e4b.bd_bitmap_folio; + ret = ext4_mb_init_cache(folio, NULL, gfp); + if (ret) + goto err; + if (!folio_test_uptodate(folio)) { + ret = -EIO; + goto err; + } + + if (e4b.bd_buddy_folio == NULL) { + /* + * If both the bitmap and buddy are in + * the same page we don't need to force + * init the buddy + */ + ret = 0; + goto err; + } + /* init buddy cache */ + folio = e4b.bd_buddy_folio; + ret = ext4_mb_init_cache(folio, e4b.bd_bitmap, gfp); + if (ret) + goto err; + if (!folio_test_uptodate(folio)) { + ret = -EIO; + goto err; + } +err: + ext4_mb_put_buddy_page_lock(&e4b); + return ret; +} + +/* + * Locking note: This routine calls ext4_mb_init_cache(), which takes the + * block group lock of all groups for this page; do not hold the BG lock when + * calling this routine! 
+ */ +static noinline_for_stack int +ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b, gfp_t gfp) +{ + int blocks_per_page; + int block; + int pnum; + int poff; + struct folio *folio; + int ret; + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + + might_sleep(); + mb_debug(sb, "load group %u\n", group); + + blocks_per_page = PAGE_SIZE / sb->s_blocksize; + grp = ext4_get_group_info(sb, group); + if (!grp) + return -EFSCORRUPTED; + + e4b->bd_blkbits = sb->s_blocksize_bits; + e4b->bd_info = grp; + e4b->bd_sb = sb; + e4b->bd_group = group; + e4b->bd_buddy_folio = NULL; + e4b->bd_bitmap_folio = NULL; + + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + /* + * we need full data about the group + * to make a good selection + */ + ret = ext4_mb_init_group(sb, group, gfp); + if (ret) + return ret; + } + + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + + /* Avoid locking the folio in the fast path ... */ + folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); + if (IS_ERR(folio) || !folio_test_uptodate(folio)) { + if (!IS_ERR(folio)) + /* + * drop the folio reference and try + * to get the folio with lock. If we + * are not uptodate that implies + * somebody just created the folio but + * is yet to initialize it. So + * wait for it to initialize. + */ + folio_put(folio); + folio = __filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (!IS_ERR(folio)) { + if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, + "ext4: bitmap's mapping != inode->i_mapping\n")) { + /* should never happen */ + folio_unlock(folio); + ret = -EINVAL; + goto err; + } + if (!folio_test_uptodate(folio)) { + ret = ext4_mb_init_cache(folio, NULL, gfp); + if (ret) { + folio_unlock(folio); + goto err; + } + mb_cmp_bitmaps(e4b, folio_address(folio) + + (poff * sb->s_blocksize)); + } + folio_unlock(folio); + } + } + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + goto err; + } + if (!folio_test_uptodate(folio)) { + ret = -EIO; + goto err; + } + + /* Folios marked accessed already */ + e4b->bd_bitmap_folio = folio; + e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); + + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + + folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); + if (IS_ERR(folio) || !folio_test_uptodate(folio)) { + if (!IS_ERR(folio)) + folio_put(folio); + folio = __filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (!IS_ERR(folio)) { + if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, + "ext4: buddy bitmap's mapping != inode->i_mapping\n")) { + /* should never happen */ + folio_unlock(folio); + ret = -EINVAL; + goto err; + } + if (!folio_test_uptodate(folio)) { + ret = ext4_mb_init_cache(folio, e4b->bd_bitmap, + gfp); + if (ret) { + folio_unlock(folio); + goto err; + } + } + folio_unlock(folio); + } + } + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + goto err; + } + if (!folio_test_uptodate(folio)) { + ret = -EIO; + goto err; + } + + /* Folios marked accessed already */ + e4b->bd_buddy_folio = folio; + e4b->bd_buddy = folio_address(folio) + (poff * sb->s_blocksize); + + return 0; + +err: + if (!IS_ERR_OR_NULL(folio)) + folio_put(folio); + if 
(e4b->bd_bitmap_folio) + folio_put(e4b->bd_bitmap_folio); + + e4b->bd_buddy = NULL; + e4b->bd_bitmap = NULL; + return ret; +} + +static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b) +{ + return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS); +} + +static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) +{ + if (e4b->bd_bitmap_folio) + folio_put(e4b->bd_bitmap_folio); + if (e4b->bd_buddy_folio) + folio_put(e4b->bd_buddy_folio); +} + + +static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) +{ + int order = 1, max; + void *bb; + + BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); + BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); + + while (order <= e4b->bd_blkbits + 1) { + bb = mb_find_buddy(e4b, order, &max); + if (!mb_test_bit(block >> order, bb)) { + /* this block is part of buddy of order 'order' */ + return order; + } + order++; + } + return 0; +} + +static void mb_clear_bits(void *bm, int cur, int len) +{ + __u32 *addr; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: clear whole word at once */ + addr = bm + (cur >> 3); + *addr = 0; + cur += 32; + continue; + } + mb_clear_bit(cur, bm); + cur++; + } +} + +/* clear bits in given range + * will return first found zero bit if any, -1 otherwise + */ +static int mb_test_and_clear_bits(void *bm, int cur, int len) +{ + __u32 *addr; + int zero_bit = -1; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: clear whole word at once */ + addr = bm + (cur >> 3); + if (*addr != (__u32)(-1) && zero_bit == -1) + zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); + *addr = 0; + cur += 32; + continue; + } + if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) + zero_bit = cur; + cur++; + } + + return zero_bit; +} + +void mb_set_bits(void *bm, int cur, int len) +{ + __u32 *addr; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: set whole word at once */ + addr = bm + (cur >> 3); + *addr = 0xffffffff; + cur += 32; + continue; + } + mb_set_bit(cur, bm); + cur++; + } +} + +static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) +{ + if (mb_test_bit(*bit + side, bitmap)) { + mb_clear_bit(*bit, bitmap); + (*bit) -= side; + return 1; + } + else { + (*bit) += side; + mb_set_bit(*bit, bitmap); + return -1; + } +} + +static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) +{ + int max; + int order = 1; + void *buddy = mb_find_buddy(e4b, order, &max); + + while (buddy) { + void *buddy2; + + /* Bits in range [first; last] are known to be set since + * corresponding blocks were allocated. Bits in range + * (first; last) will stay set because they form buddies on + * upper layer. We just deal with borders if they don't + * align with upper layer and then go up. + * Releasing entire group is all about clearing + * single bit of highest order buddy. + */ + + /* Example: + * --------------------------------- + * | 1 | 1 | 1 | 1 | + * --------------------------------- + * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | + * --------------------------------- + * 0 1 2 3 4 5 6 7 + * \_____________________/ + * + * Neither [1] nor [6] is aligned to above layer. + * Left neighbour [0] is free, so mark it busy, + * decrease bb_counters and extend range to + * [0; 6] + * Right neighbour [7] is busy. It can't be coaleasced with [6], so + * mark [6] free, increase bb_counters and shrink range to + * [0; 5]. 
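[A side note on mb_set_bits()/mb_clear_bits() defined earlier in this file:
both use a word-at-a-time fast path whenever the cursor is 32-bit aligned
and at least 32 bits remain. A simplified standalone model, indexing a
uint32_t array rather than using the kernel's byte-offset arithmetic.]

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static void set_bits_model(uint32_t *bm, int cur, int len)
{
	len = cur + len;
	while (cur < len) {
		if ((cur & 31) == 0 && (len - cur) >= 32) {
			bm[cur >> 5] = 0xffffffff;	/* whole word at once */
			cur += 32;
			continue;
		}
		bm[cur >> 5] |= 1u << (cur & 31);	/* single bit */
		cur++;
	}
}

int main(void)
{
	uint32_t bm[4];

	memset(bm, 0, sizeof(bm));
	set_bits_model(bm, 30, 40);	/* set bits 30..69 */
	/* prints c0000000 ffffffff 0000003f */
	printf("%08x %08x %08x\n", (unsigned)bm[0], (unsigned)bm[1],
	       (unsigned)bm[2]);
	return 0;
}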
+ * Then shift range to [0; 2], go up and do the same. + */ + + + if (first & 1) + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); + if (!(last & 1)) + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); + if (first > last) + break; + order++; + + buddy2 = mb_find_buddy(e4b, order, &max); + if (!buddy2) { + mb_clear_bits(buddy, first, last - first + 1); + e4b->bd_info->bb_counters[order - 1] += last - first + 1; + break; + } + first >>= 1; + last >>= 1; + buddy = buddy2; + } +} + +static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, + int first, int count) +{ + int left_is_free = 0; + int right_is_free = 0; + int block; + int last = first + count - 1; + struct super_block *sb = e4b->bd_sb; + + if (WARN_ON(count == 0)) + return; + BUG_ON(last >= (sb->s_blocksize << 3)); + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); + /* Don't bother if the block group is corrupt. */ + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + return; + + mb_check_buddy(e4b); + mb_free_blocks_double(inode, e4b, first, count); + + /* access memory sequentially: check left neighbour, + * clear range and then check right neighbour + */ + if (first != 0) + left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); + block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); + if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) + right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); + + if (unlikely(block != -1)) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t blocknr; + + /* + * Fastcommit replay can free already freed blocks which + * corrupts allocation info. Regenerate it. + */ + if (sbi->s_mount_state & EXT4_FC_REPLAY) { + mb_regenerate_buddy(e4b); + goto check; + } + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); + blocknr += EXT4_C2B(sbi, block); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? inode->i_ino : 0, blocknr, + "freeing already freed block (bit %u); block bitmap corrupt.", + block); + return; + } + + this_cpu_inc(discard_pa_seq); + e4b->bd_info->bb_free += count; + if (first < e4b->bd_info->bb_first_free) + e4b->bd_info->bb_first_free = first; + + /* let's maintain fragments counter */ + if (left_is_free && right_is_free) + e4b->bd_info->bb_fragments--; + else if (!left_is_free && !right_is_free) + e4b->bd_info->bb_fragments++; + + /* buddy[0] == bd_bitmap is a special case, so handle + * it right away and let mb_buddy_mark_free stay free of + * zero order checks. + * Check if neighbours are to be coaleasced, + * adjust bitmap bb_counters and borders appropriately. + */ + if (first & 1) { + first += !left_is_free; + e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; + } + if (!(last & 1)) { + last -= !right_is_free; + e4b->bd_info->bb_counters[0] += right_is_free ? 
-1 : 1; + } + + if (first <= last) + mb_buddy_mark_free(e4b, first >> 1, last >> 1); + + mb_set_largest_free_order(sb, e4b->bd_info); + mb_update_avg_fragment_size(sb, e4b->bd_info); +check: + mb_check_buddy(e4b); +} + +static int mb_find_extent(struct ext4_buddy *e4b, int block, + int needed, struct ext4_free_extent *ex) +{ + int max, order, next; + void *buddy; + + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); + BUG_ON(ex == NULL); + + buddy = mb_find_buddy(e4b, 0, &max); + BUG_ON(buddy == NULL); + BUG_ON(block >= max); + if (mb_test_bit(block, buddy)) { + ex->fe_len = 0; + ex->fe_start = 0; + ex->fe_group = 0; + return 0; + } + + /* find actual order */ + order = mb_find_order_for_block(e4b, block); + + ex->fe_len = (1 << order) - (block & ((1 << order) - 1)); + ex->fe_start = block; + ex->fe_group = e4b->bd_group; + + block = block >> order; + + while (needed > ex->fe_len && + mb_find_buddy(e4b, order, &max)) { + + if (block + 1 >= max) + break; + + next = (block + 1) * (1 << order); + if (mb_test_bit(next, e4b->bd_bitmap)) + break; + + order = mb_find_order_for_block(e4b, next); + + block = next >> order; + ex->fe_len += 1 << order; + } + + if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) { + /* Should never happen! (but apparently sometimes does?!?) */ + WARN_ON(1); + ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0, + "corruption or bug in mb_find_extent " + "block=%d, order=%d needed=%d ex=%u/%d/%d@%u", + block, order, needed, ex->fe_group, ex->fe_start, + ex->fe_len, ex->fe_logical); + ex->fe_len = 0; + ex->fe_start = 0; + ex->fe_group = 0; + } + return ex->fe_len; +} + +static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) +{ + int ord; + int mlen = 0; + int max = 0; + int start = ex->fe_start; + int len = ex->fe_len; + unsigned ret = 0; + int len0 = len; + void *buddy; + int ord_start, ord_end; + + BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); + BUG_ON(e4b->bd_group != ex->fe_group); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); + mb_check_buddy(e4b); + mb_mark_used_double(e4b, start, len); + + this_cpu_inc(discard_pa_seq); + e4b->bd_info->bb_free -= len; + if (e4b->bd_info->bb_first_free == start) + e4b->bd_info->bb_first_free += len; + + /* let's maintain fragments counter */ + if (start != 0) + mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); + if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) + max = !mb_test_bit(start + len, e4b->bd_bitmap); + if (mlen && max) + e4b->bd_info->bb_fragments++; + else if (!mlen && !max) + e4b->bd_info->bb_fragments--; + + /* let's maintain buddy itself */ + while (len) { + ord = mb_find_order_for_block(e4b, start); + + if (((start >> ord) << ord) == start && len >= (1 << ord)) { + /* the whole chunk may be allocated at once! 
*/ + mlen = 1 << ord; + buddy = mb_find_buddy(e4b, ord, &max); + BUG_ON((start >> ord) >= max); + mb_set_bit(start >> ord, buddy); + e4b->bd_info->bb_counters[ord]--; + start += mlen; + len -= mlen; + BUG_ON(len < 0); + continue; + } + + /* store for history */ + if (ret == 0) + ret = len | (ord << 16); + + BUG_ON(ord <= 0); + buddy = mb_find_buddy(e4b, ord, &max); + mb_set_bit(start >> ord, buddy); + e4b->bd_info->bb_counters[ord]--; + + ord_start = (start >> ord) << ord; + ord_end = ord_start + (1 << ord); + /* first chunk */ + if (start > ord_start) + ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, + ord_start, start - ord_start, + e4b->bd_info); + + /* last chunk */ + if (start + len < ord_end) { + ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, + start + len, + ord_end - (start + len), + e4b->bd_info); + break; + } + len = start + len - ord_end; + start = ord_end; + } + mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); + + mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info); + mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0); + mb_check_buddy(e4b); + + return ret; +} + +/* + * Must be called under group lock! + */ +static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int ret; + + BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); + BUG_ON(ac->ac_status == AC_STATUS_FOUND); + + ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); + ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; + ret = mb_mark_used(e4b, &ac->ac_b_ex); + + /* preallocation can change ac_b_ex, thus we store actually + * allocated blocks for history */ + ac->ac_f_ex = ac->ac_b_ex; + + ac->ac_status = AC_STATUS_FOUND; + ac->ac_tail = ret & 0xffff; + ac->ac_buddy = ret >> 16; + + /* + * take the page reference. We want the page to be pinned + * so that we don't get a ext4_mb_init_cache_call for this + * group until we update the bitmap. That would mean we + * double allocate blocks. The reference is dropped + * in ext4_mb_release_context + */ + ac->ac_bitmap_folio = e4b->bd_bitmap_folio; + folio_get(ac->ac_bitmap_folio); + ac->ac_buddy_folio = e4b->bd_buddy_folio; + folio_get(ac->ac_buddy_folio); + /* store last allocated for subsequent stream allocation */ + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { + int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; + + WRITE_ONCE(sbi->s_mb_last_groups[hash], ac->ac_f_ex.fe_group); + } + + /* + * As we've just preallocated more space than + * user requested originally, we store allocated + * space in a special descriptor. + */ + if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) + ext4_mb_new_preallocation(ac); + +} + +static void ext4_mb_check_limits(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b, + int finish_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_free_extent *bex = &ac->ac_b_ex; + struct ext4_free_extent *gex = &ac->ac_g_ex; + + if (ac->ac_status == AC_STATUS_FOUND) + return; + /* + * We don't want to scan for a whole year + */ + if (ac->ac_found > sbi->s_mb_max_to_scan && + !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { + ac->ac_status = AC_STATUS_BREAK; + return; + } + + /* + * Haven't found good chunk so far, let's continue + */ + if (bex->fe_len < gex->fe_len) + return; + + if (finish_group || ac->ac_found > sbi->s_mb_min_to_scan) + ext4_mb_use_best_found(ac, e4b); +} + +/* + * The routine checks whether found extent is good enough. 
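[The selection policy that the surrounding comment describes (an exact fit
wins immediately; otherwise keep the largest extent while still short of the
goal, or the smallest extent that already satisfies it) can be restated in a
few lines. A sketch with illustrative names, working on plain lengths
instead of struct ext4_free_extent.]

#include <stdio.h>

/* returns 1 when the search can stop (exact fit) */
static int measure_model(int *best, int found, int goal)
{
	if (found == goal) {
		*best = found;
		return 1;
	}
	if (*best == 0)
		*best = found;		/* first extent found */
	else if (*best < goal && found > *best)
		*best = found;		/* bigger is better while short */
	else if (found >= goal && found < *best)
		*best = found;		/* smallest satisfying extent */
	return 0;
}

int main(void)
{
	int best = 0, goal = 64;
	int found[] = { 16, 32, 200, 100, 80 };
	int i;

	for (i = 0; i < 5; i++)
		if (measure_model(&best, found[i], goal))
			break;
	printf("best extent: %d clusters (goal %d)\n", best, goal);
	return 0;
}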
If it is, + * then the extent gets marked used and flag is set to the context + * to stop scanning. Otherwise, the extent is compared with the + * previous found extent and if new one is better, then it's stored + * in the context. Later, the best found extent will be used, if + * mballoc can't find good enough extent. + * + * The algorithm used is roughly as follows: + * + * * If free extent found is exactly as big as goal, then + * stop the scan and use it immediately + * + * * If free extent found is smaller than goal, then keep retrying + * upto a max of sbi->s_mb_max_to_scan times (default 200). After + * that stop scanning and use whatever we have. + * + * * If free extent found is bigger than goal, then keep retrying + * upto a max of sbi->s_mb_min_to_scan times (default 10) before + * stopping the scan and using the extent. + * + * + * FIXME: real allocation policy is to be designed yet! + */ +static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, + struct ext4_free_extent *ex, + struct ext4_buddy *e4b) +{ + struct ext4_free_extent *bex = &ac->ac_b_ex; + struct ext4_free_extent *gex = &ac->ac_g_ex; + + BUG_ON(ex->fe_len <= 0); + BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); + BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); + + ac->ac_found++; + ac->ac_cX_found[ac->ac_criteria]++; + + /* + * The special case - take what you catch first + */ + if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { + *bex = *ex; + ext4_mb_use_best_found(ac, e4b); + return; + } + + /* + * Let's check whether the chuck is good enough + */ + if (ex->fe_len == gex->fe_len) { + *bex = *ex; + ext4_mb_use_best_found(ac, e4b); + return; + } + + /* + * If this is first found extent, just store it in the context + */ + if (bex->fe_len == 0) { + *bex = *ex; + return; + } + + /* + * If new found extent is better, store it in the context + */ + if (bex->fe_len < gex->fe_len) { + /* if the request isn't satisfied, any found extent + * larger than previous best one is better */ + if (ex->fe_len > bex->fe_len) + *bex = *ex; + } else if (ex->fe_len > gex->fe_len) { + /* if the request is satisfied, then we try to find + * an extent that still satisfy the request, but is + * smaller than previous one */ + if (ex->fe_len < bex->fe_len) + *bex = *ex; + } + + ext4_mb_check_limits(ac, e4b, 0); +} + +static noinline_for_stack +void ext4_mb_try_best_found(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct ext4_free_extent ex = ac->ac_b_ex; + ext4_group_t group = ex.fe_group; + int max; + int err; + + BUG_ON(ex.fe_len <= 0); + err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); + if (err) + return; + + ext4_lock_group(ac->ac_sb, group); + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + goto out; + + max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); + + if (max > 0) { + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } + +out: + ext4_unlock_group(ac->ac_sb, group); + ext4_mb_unload_buddy(e4b); +} + +static noinline_for_stack +int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + ext4_group_t group = ac->ac_g_ex.fe_group; + int max; + int err; + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); + struct ext4_free_extent ex; + + if (!grp) + return -EFSCORRUPTED; + if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY))) + return 0; + if (grp->bb_free == 0) + return 0; + + err = 
ext4_mb_load_buddy(ac->ac_sb, group, e4b); + if (err) + return err; + + ext4_lock_group(ac->ac_sb, group); + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + goto out; + + max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, + ac->ac_g_ex.fe_len, &ex); + ex.fe_logical = 0xDEADFA11; /* debug value */ + + if (max >= ac->ac_g_ex.fe_len && + ac->ac_g_ex.fe_len == EXT4_NUM_B2C(sbi, sbi->s_stripe)) { + ext4_fsblk_t start; + + start = ext4_grp_offs_to_block(ac->ac_sb, &ex); + /* use do_div to get remainder (would be 64-bit modulo) */ + if (do_div(start, sbi->s_stripe) == 0) { + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } + } else if (max >= ac->ac_g_ex.fe_len) { + BUG_ON(ex.fe_len <= 0); + BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); + BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { + /* Sometimes, caller may want to merge even small + * number of blocks to an existing extent */ + BUG_ON(ex.fe_len <= 0); + BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); + BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } +out: + ext4_unlock_group(ac->ac_sb, group); + ext4_mb_unload_buddy(e4b); + + return 0; +} + +/* + * The routine scans buddy structures (not bitmap!) from given order + * to max order and tries to find big enough chunk to satisfy the req + */ +static noinline_for_stack +void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_group_info *grp = e4b->bd_info; + void *buddy; + int i; + int k; + int max; + + BUG_ON(ac->ac_2order <= 0); + for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) { + if (grp->bb_counters[i] == 0) + continue; + + buddy = mb_find_buddy(e4b, i, &max); + if (WARN_RATELIMIT(buddy == NULL, + "ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", i)) + continue; + + k = mb_find_next_zero_bit(buddy, max, 0); + if (k >= max) { + ext4_mark_group_bitmap_corrupted(ac->ac_sb, + e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0, + "%d free clusters of order %d. But found 0", + grp->bb_counters[i], i); + break; + } + ac->ac_found++; + ac->ac_cX_found[ac->ac_criteria]++; + + ac->ac_b_ex.fe_len = 1 << i; + ac->ac_b_ex.fe_start = k << i; + ac->ac_b_ex.fe_group = e4b->bd_group; + + ext4_mb_use_best_found(ac, e4b); + + BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len); + + if (EXT4_SB(sb)->s_mb_stats) + atomic_inc(&EXT4_SB(sb)->s_bal_2orders); + + break; + } +} + +/* + * The routine scans the group and measures all found extents. + * In order to optimize scanning, caller must pass number of + * free blocks in the group, so the routine can know upper limit. 
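+ *
+ * The heart of the walk, as a simplified sketch (the bitmap
+ * corruption checks and the cheap-criteria skip in the body below
+ * are omitted):
+ *
+ *	free = e4b->bd_info->bb_free;
+ *	i = e4b->bd_info->bb_first_free;
+ *	while (free && ac->ac_status == AC_STATUS_CONTINUE) {
+ *		i = mb_find_next_zero_bit(bitmap, nclusters, i);
+ *		mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
+ *		ext4_mb_measure_extent(ac, &ex, e4b);
+ *		i += ex.fe_len;
+ *		free -= ex.fe_len;
+ *	}
+ *
+ * where nclusters is shorthand (for this comment only) for
+ * EXT4_CLUSTERS_PER_GROUP(sb).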
+ */ +static noinline_for_stack +void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct super_block *sb = ac->ac_sb; + void *bitmap = e4b->bd_bitmap; + struct ext4_free_extent ex; + int i, j, freelen; + int free; + + free = e4b->bd_info->bb_free; + if (WARN_ON(free <= 0)) + return; + + i = e4b->bd_info->bb_first_free; + + while (free && ac->ac_status == AC_STATUS_CONTINUE) { + i = mb_find_next_zero_bit(bitmap, + EXT4_CLUSTERS_PER_GROUP(sb), i); + if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { + /* + * IF we have corrupt bitmap, we won't find any + * free blocks even though group info says we + * have free blocks + */ + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free clusters as per " + "group info. But bitmap says 0", + free); + break; + } + + if (!ext4_mb_cr_expensive(ac->ac_criteria)) { + /* + * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are + * sure that this group will have a large enough + * continuous free extent, so skip over the smaller free + * extents + */ + j = mb_find_next_bit(bitmap, + EXT4_CLUSTERS_PER_GROUP(sb), i); + freelen = j - i; + + if (freelen < ac->ac_g_ex.fe_len) { + i = j; + free -= freelen; + continue; + } + } + + mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); + if (WARN_ON(ex.fe_len <= 0)) + break; + if (free < ex.fe_len) { + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free clusters as per " + "group info. But got %d blocks", + free, ex.fe_len); + /* + * The number of free blocks differs. This mostly + * indicate that the bitmap is corrupt. So exit + * without claiming the space. 
+ */ + break; + } + ex.fe_logical = 0xDEADC0DE; /* debug value */ + ext4_mb_measure_extent(ac, &ex, e4b); + + i += ex.fe_len; + free -= ex.fe_len; + } + + ext4_mb_check_limits(ac, e4b, 1); +} + +/* + * This is a special case for storages like raid5 + * we try to find stripe-aligned chunks for stripe-size-multiple requests + */ +static noinline_for_stack +void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + void *bitmap = e4b->bd_bitmap; + struct ext4_free_extent ex; + ext4_fsblk_t first_group_block; + ext4_fsblk_t a; + ext4_grpblk_t i, stripe; + int max; + + BUG_ON(sbi->s_stripe == 0); + + /* find first stripe-aligned block in group */ + first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); + + a = first_group_block + sbi->s_stripe - 1; + do_div(a, sbi->s_stripe); + i = (a * sbi->s_stripe) - first_group_block; + + stripe = EXT4_NUM_B2C(sbi, sbi->s_stripe); + i = EXT4_B2C(sbi, i); + while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { + if (!mb_test_bit(i, bitmap)) { + max = mb_find_extent(e4b, i, stripe, &ex); + if (max >= stripe) { + ac->ac_found++; + ac->ac_cX_found[ac->ac_criteria]++; + ex.fe_logical = 0xDEADF00D; /* debug value */ + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + break; + } + } + i += stripe; + } +} + +static void __ext4_mb_scan_group(struct ext4_allocation_context *ac) +{ + bool is_stripe_aligned; + struct ext4_sb_info *sbi; + enum criteria cr = ac->ac_criteria; + + ac->ac_groups_scanned++; + if (cr == CR_POWER2_ALIGNED) + return ext4_mb_simple_scan_group(ac, ac->ac_e4b); + + sbi = EXT4_SB(ac->ac_sb); + is_stripe_aligned = false; + if ((sbi->s_stripe >= sbi->s_cluster_ratio) && + !(ac->ac_g_ex.fe_len % EXT4_NUM_B2C(sbi, sbi->s_stripe))) + is_stripe_aligned = true; + + if ((cr == CR_GOAL_LEN_FAST || cr == CR_BEST_AVAIL_LEN) && + is_stripe_aligned) + ext4_mb_scan_aligned(ac, ac->ac_e4b); + + if (ac->ac_status == AC_STATUS_CONTINUE) + ext4_mb_complex_scan_group(ac, ac->ac_e4b); +} + +/* + * This is also called BEFORE we load the buddy bitmap. + * Returns either 1 or 0 indicating that the group is either suitable + * for the allocation or not. 
+ */ +static bool ext4_mb_good_group(struct ext4_allocation_context *ac, + ext4_group_t group, enum criteria cr) +{ + ext4_grpblk_t free, fragments; + int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); + + BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS); + + if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) + return false; + + free = grp->bb_free; + if (free == 0) + return false; + + fragments = grp->bb_fragments; + if (fragments == 0) + return false; + + switch (cr) { + case CR_POWER2_ALIGNED: + BUG_ON(ac->ac_2order == 0); + + /* Avoid using the first bg of a flexgroup for data files */ + if ((ac->ac_flags & EXT4_MB_HINT_DATA) && + (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && + ((group % flex_size) == 0)) + return false; + + if (free < ac->ac_g_ex.fe_len) + return false; + + if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb)) + return true; + + if (grp->bb_largest_free_order < ac->ac_2order) + return false; + + return true; + case CR_GOAL_LEN_FAST: + case CR_BEST_AVAIL_LEN: + if ((free / fragments) >= ac->ac_g_ex.fe_len) + return true; + break; + case CR_GOAL_LEN_SLOW: + if (free >= ac->ac_g_ex.fe_len) + return true; + break; + case CR_ANY_FREE: + return true; + default: + BUG(); + } + + return false; +} + +/* + * This could return negative error code if something goes wrong + * during ext4_mb_init_group(). This should not be called with + * ext4_lock_group() held. + * + * Note: because we are conditionally operating with the group lock in + * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this + * function using __acquire and __release. This means we need to be + * super careful before messing with the error path handling via "goto + * out"! + */ +static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, + ext4_group_t group, enum criteria cr) +{ + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK; + ext4_grpblk_t free; + int ret = 0; + + if (!grp) + return -EFSCORRUPTED; + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); + if (should_lock) { + ext4_lock_group(sb, group); + __release(ext4_group_lock_ptr(sb, group)); + } + free = grp->bb_free; + if (free == 0) + goto out; + /* + * In all criterias except CR_ANY_FREE we try to avoid groups that + * can't possibly satisfy the full goal request due to insufficient + * free blocks. + */ + if (cr < CR_ANY_FREE && free < ac->ac_g_ex.fe_len) + goto out; + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) + goto out; + if (should_lock) { + __acquire(ext4_group_lock_ptr(sb, group)); + ext4_unlock_group(sb, group); + } + + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + struct ext4_group_desc *gdp = + ext4_get_group_desc(sb, group, NULL); + int ret; + + /* + * CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic + * search to find large good chunks almost for free. If buddy + * data is not ready, then this optimization makes no sense. But + * we never skip the first block group in a flex_bg, since this + * gets used for metadata block allocation, and we want to make + * sure we locate metadata blocks in the first block group in + * the flex_bg if possible. 
+ */ + if (!ext4_mb_cr_expensive(cr) && + (!sbi->s_log_groups_per_flex || + ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && + !(ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) + return 0; + ret = ext4_mb_init_group(sb, group, GFP_NOFS); + if (ret) + return ret; + } + + if (should_lock) { + ext4_lock_group(sb, group); + __release(ext4_group_lock_ptr(sb, group)); + } + ret = ext4_mb_good_group(ac, group, cr); +out: + if (should_lock) { + __acquire(ext4_group_lock_ptr(sb, group)); + ext4_unlock_group(sb, group); + } + return ret; +} + +/* + * Start prefetching @nr block bitmaps starting at @group. + * Return the next group which needs to be prefetched. + */ +ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, + unsigned int nr, int *cnt) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct buffer_head *bh; + struct blk_plug plug; + + blk_start_plug(&plug); + while (nr-- > 0) { + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, + NULL); + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + + /* + * Prefetch block groups with free blocks; but don't + * bother if it is marked uninitialized on disk, since + * it won't require I/O to read. Also only try to + * prefetch once, so we avoid getblk() call, which can + * be expensive. + */ + if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) && + EXT4_MB_GRP_NEED_INIT(grp) && + ext4_free_group_clusters(sb, gdp) > 0 ) { + bh = ext4_read_block_bitmap_nowait(sb, group, true); + if (bh && !IS_ERR(bh)) { + if (!buffer_uptodate(bh) && cnt) + (*cnt)++; + brelse(bh); + } + } + if (++group >= ngroups) + group = 0; + } + blk_finish_plug(&plug); + return group; +} + +/* + * Batch reads of the block allocation bitmaps to get + * multiple READs in flight; limit prefetching at inexpensive + * CR, otherwise mballoc can spend a lot of time loading + * imperfect groups + */ +static void ext4_mb_might_prefetch(struct ext4_allocation_context *ac, + ext4_group_t group) +{ + struct ext4_sb_info *sbi; + + if (ac->ac_prefetch_grp != group) + return; + + sbi = EXT4_SB(ac->ac_sb); + if (ext4_mb_cr_expensive(ac->ac_criteria) || + ac->ac_prefetch_ios < sbi->s_mb_prefetch_limit) { + unsigned int nr = sbi->s_mb_prefetch; + + if (ext4_has_feature_flex_bg(ac->ac_sb)) { + nr = 1 << sbi->s_log_groups_per_flex; + nr -= group & (nr - 1); + nr = umin(nr, sbi->s_mb_prefetch); + } + + ac->ac_prefetch_nr = nr; + ac->ac_prefetch_grp = ext4_mb_prefetch(ac->ac_sb, group, nr, + &ac->ac_prefetch_ios); + } +} + +/* + * Prefetching reads the block bitmap into the buffer cache; but we + * need to make sure that the buddy bitmap in the page cache has been + * initialized. Note that ext4_mb_init_group() will block if the I/O + * is not yet completed, or indeed if it was not initiated by + * ext4_mb_prefetch did not start the I/O. + * + * TODO: We should actually kick off the buddy bitmap setup in a work + * queue when the buffer I/O is completed, so that we don't block + * waiting for the block allocation bitmap read to finish when + * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator(). 
+ */ +void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, + unsigned int nr) +{ + struct ext4_group_desc *gdp; + struct ext4_group_info *grp; + + while (nr-- > 0) { + if (!group) + group = ext4_get_groups_count(sb); + group--; + gdp = ext4_get_group_desc(sb, group, NULL); + grp = ext4_get_group_info(sb, group); + + if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) && + ext4_free_group_clusters(sb, gdp) > 0) { + if (ext4_mb_init_group(sb, group, GFP_NOFS)) + break; + } + } +} + +static int ext4_mb_scan_group(struct ext4_allocation_context *ac, + ext4_group_t group) +{ + int ret; + struct super_block *sb = ac->ac_sb; + enum criteria cr = ac->ac_criteria; + + ext4_mb_might_prefetch(ac, group); + + /* prevent unnecessary buddy loading. */ + if (cr < CR_ANY_FREE && spin_is_locked(ext4_group_lock_ptr(sb, group))) + return 0; + + /* This now checks without needing the buddy page */ + ret = ext4_mb_good_group_nolock(ac, group, cr); + if (ret <= 0) { + if (!ac->ac_first_err) + ac->ac_first_err = ret; + return 0; + } + + ret = ext4_mb_load_buddy(sb, group, ac->ac_e4b); + if (ret) + return ret; + + /* skip busy group */ + if (cr >= CR_ANY_FREE) + ext4_lock_group(sb, group); + else if (!ext4_try_lock_group(sb, group)) + goto out_unload; + + /* We need to check again after locking the block group. */ + if (unlikely(!ext4_mb_good_group(ac, group, cr))) + goto out_unlock; + + __ext4_mb_scan_group(ac); + +out_unlock: + ext4_unlock_group(sb, group); +out_unload: + ext4_mb_unload_buddy(ac->ac_e4b); + return ret; +} + +static noinline_for_stack int +ext4_mb_regular_allocator(struct ext4_allocation_context *ac) +{ + ext4_group_t i; + int err = 0; + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_buddy e4b; + + BUG_ON(ac->ac_status == AC_STATUS_FOUND); + + /* first, try the goal */ + err = ext4_mb_find_by_goal(ac, &e4b); + if (err || ac->ac_status == AC_STATUS_FOUND) + goto out; + + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + goto out; + + /* + * ac->ac_2order is set only if the fe_len is a power of 2 + * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED + * so that we try exact allocation using buddy. + */ + i = fls(ac->ac_g_ex.fe_len); + ac->ac_2order = 0; + /* + * We search using buddy data only if the order of the request + * is greater than equal to the sbi_s_mb_order2_reqs + * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req + * We also support searching for power-of-two requests only for + * requests upto maximum buddy size we have constructed. + */ + if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { + if (is_power_of_2(ac->ac_g_ex.fe_len)) + ac->ac_2order = array_index_nospec(i - 1, + MB_NUM_ORDERS(sb)); + } + + /* if stream allocation is enabled, use global goal */ + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { + int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; + + ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]); + ac->ac_g_ex.fe_start = -1; + ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL; + } + + /* + * Let's just scan groups to find more-less suitable blocks We + * start with CR_GOAL_LEN_FAST, unless it is power of 2 + * aligned, in which case let's do that faster approach first. 
+ */ + ac->ac_criteria = CR_GOAL_LEN_FAST; + if (ac->ac_2order) + ac->ac_criteria = CR_POWER2_ALIGNED; + + ac->ac_e4b = &e4b; + ac->ac_prefetch_ios = 0; + ac->ac_first_err = 0; +repeat: + while (ac->ac_criteria < EXT4_MB_NUM_CRS) { + err = ext4_mb_scan_groups(ac); + if (err) + goto out; + + if (ac->ac_status != AC_STATUS_CONTINUE) + break; + } + + if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && + !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { + /* + * We've been searching too long. Let's try to allocate + * the best chunk we've found so far + */ + ext4_mb_try_best_found(ac, &e4b); + if (ac->ac_status != AC_STATUS_FOUND) { + int lost; + + /* + * Someone more lucky has already allocated it. + * The only thing we can do is just take first + * found block(s) + */ + lost = atomic_inc_return(&sbi->s_mb_lost_chunks); + mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n", + ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len, lost); + + ac->ac_b_ex.fe_group = 0; + ac->ac_b_ex.fe_start = 0; + ac->ac_b_ex.fe_len = 0; + ac->ac_status = AC_STATUS_CONTINUE; + ac->ac_flags |= EXT4_MB_HINT_FIRST; + ac->ac_criteria = CR_ANY_FREE; + goto repeat; + } + } + + if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) { + atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]); + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC && + ac->ac_b_ex.fe_group == ac->ac_g_ex.fe_group) + atomic_inc(&sbi->s_bal_stream_goals); + } +out: + if (!err && ac->ac_status != AC_STATUS_FOUND && ac->ac_first_err) + err = ac->ac_first_err; + + mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", + ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, + ac->ac_flags, ac->ac_criteria, err); + + if (ac->ac_prefetch_nr) + ext4_mb_prefetch_fini(sb, ac->ac_prefetch_grp, ac->ac_prefetch_nr); + + return err; +} + +static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) +{ + struct super_block *sb = pde_data(file_inode(seq->file)); + ext4_group_t group; + + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) + return NULL; + group = *pos + 1; + return (void *) ((unsigned long) group); +} + +static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct super_block *sb = pde_data(file_inode(seq->file)); + ext4_group_t group; + + ++*pos; + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) + return NULL; + group = *pos + 1; + return (void *) ((unsigned long) group); +} + +static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) +{ + struct super_block *sb = pde_data(file_inode(seq->file)); + ext4_group_t group = (ext4_group_t) ((unsigned long) v); + int i, err; + char nbuf[16]; + struct ext4_buddy e4b; + struct ext4_group_info *grinfo; + unsigned char blocksize_bits = min_t(unsigned char, + sb->s_blocksize_bits, + EXT4_MAX_BLOCK_LOG_SIZE); + DEFINE_RAW_FLEX(struct ext4_group_info, sg, bb_counters, + EXT4_MAX_BLOCK_LOG_SIZE + 2); + + group--; + if (group == 0) + seq_puts(seq, "#group: free frags first [" + " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " + " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); + + i = (blocksize_bits + 2) * sizeof(sg->bb_counters[0]) + + sizeof(struct ext4_group_info); + + grinfo = ext4_get_group_info(sb, group); + if (!grinfo) + return 0; + /* Load the group info in memory only if not already loaded. 
*/ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { + seq_printf(seq, "#%-5u: %s\n", group, ext4_decode_error(NULL, err, nbuf)); + return 0; + } + ext4_mb_unload_buddy(&e4b); + } + + /* + * We care only about free space counters in the group info and + * these are safe to access even after the buddy has been unloaded + */ + memcpy(sg, grinfo, i); + seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg->bb_free, + sg->bb_fragments, sg->bb_first_free); + for (i = 0; i <= 13; i++) + seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? + sg->bb_counters[i] : 0); + seq_puts(seq, " ]"); + if (EXT4_MB_GRP_BBITMAP_CORRUPT(sg)) + seq_puts(seq, " Block bitmap corrupted!"); + seq_putc(seq, '\n'); + return 0; +} + +static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) +{ +} + +const struct seq_operations ext4_mb_seq_groups_ops = { + .start = ext4_mb_seq_groups_start, + .next = ext4_mb_seq_groups_next, + .stop = ext4_mb_seq_groups_stop, + .show = ext4_mb_seq_groups_show, +}; + +int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + seq_puts(seq, "mballoc:\n"); + if (!sbi->s_mb_stats) { + seq_puts(seq, "\tmb stats collection turned off.\n"); + seq_puts( + seq, + "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); + return 0; + } + seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); + seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); + + seq_printf(seq, "\tgroups_scanned: %u\n", + atomic_read(&sbi->s_bal_groups_scanned)); + + /* CR_POWER2_ALIGNED stats */ + seq_puts(seq, "\tcr_p2_aligned_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", + atomic64_read(&sbi->s_bal_cX_hits[CR_POWER2_ALIGNED])); + seq_printf( + seq, "\t\tgroups_considered: %llu\n", + atomic64_read( + &sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED])); + seq_printf(seq, "\t\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED])); + + /* CR_GOAL_LEN_FAST stats */ + seq_puts(seq, "\tcr_goal_fast_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", + atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_FAST])); + seq_printf(seq, "\t\tgroups_considered: %llu\n", + atomic64_read( + &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_FAST])); + seq_printf(seq, "\t\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST])); + + /* CR_BEST_AVAIL_LEN stats */ + seq_puts(seq, "\tcr_best_avail_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", + atomic64_read(&sbi->s_bal_cX_hits[CR_BEST_AVAIL_LEN])); + seq_printf( + seq, "\t\tgroups_considered: %llu\n", + atomic64_read( + &sbi->s_bal_cX_groups_considered[CR_BEST_AVAIL_LEN])); + seq_printf(seq, "\t\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN])); + + /* CR_GOAL_LEN_SLOW stats */ + seq_puts(seq, "\tcr_goal_slow_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", + atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_SLOW])); + seq_printf(seq, "\t\tgroups_considered: %llu\n", + atomic64_read( + &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_SLOW])); + seq_printf(seq, "\t\textents_scanned: %u\n", + 
atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_SLOW])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_SLOW])); + + /* CR_ANY_FREE stats */ + seq_puts(seq, "\tcr_any_free_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", + atomic64_read(&sbi->s_bal_cX_hits[CR_ANY_FREE])); + seq_printf( + seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[CR_ANY_FREE])); + seq_printf(seq, "\t\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_cX_ex_scanned[CR_ANY_FREE])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[CR_ANY_FREE])); + + /* Aggregates */ + seq_printf(seq, "\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_ex_scanned)); + seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); + seq_printf(seq, "\t\tstream_goal_hits: %u\n", + atomic_read(&sbi->s_bal_stream_goals)); + seq_printf(seq, "\t\tlen_goal_hits: %u\n", + atomic_read(&sbi->s_bal_len_goals)); + seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); + seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); + seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); + seq_printf(seq, "\tbuddies_generated: %u/%u\n", + atomic_read(&sbi->s_mb_buddies_generated), + ext4_get_groups_count(sb)); + seq_printf(seq, "\tbuddies_time_used: %llu\n", + atomic64_read(&sbi->s_mb_generation_time)); + seq_printf(seq, "\tpreallocated: %u\n", + atomic_read(&sbi->s_mb_preallocated)); + seq_printf(seq, "\tdiscarded: %u\n", atomic_read(&sbi->s_mb_discarded)); + return 0; +} + +static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) +{ + struct super_block *sb = pde_data(file_inode(seq->file)); + unsigned long position; + + if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) + return NULL; + position = *pos + 1; + return (void *) ((unsigned long) position); +} + +static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct super_block *sb = pde_data(file_inode(seq->file)); + unsigned long position; + + ++*pos; + if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) + return NULL; + position = *pos + 1; + return (void *) ((unsigned long) position); +} + +static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) +{ + struct super_block *sb = pde_data(file_inode(seq->file)); + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned long position = ((unsigned long) v); + struct ext4_group_info *grp; + unsigned int count; + unsigned long idx; + + position--; + if (position >= MB_NUM_ORDERS(sb)) { + position -= MB_NUM_ORDERS(sb); + if (position == 0) + seq_puts(seq, "avg_fragment_size_lists:\n"); + + count = 0; + xa_for_each(&sbi->s_mb_avg_fragment_size[position], idx, grp) + count++; + seq_printf(seq, "\tlist_order_%u_groups: %u\n", + (unsigned int)position, count); + return 0; + } + + if (position == 0) { + seq_printf(seq, "optimize_scan: %d\n", + test_opt2(sb, MB_OPTIMIZE_SCAN) ? 
1 : 0); + seq_puts(seq, "max_free_order_lists:\n"); + } + count = 0; + xa_for_each(&sbi->s_mb_largest_free_orders[position], idx, grp) + count++; + seq_printf(seq, "\tlist_order_%u_groups: %u\n", + (unsigned int)position, count); + + return 0; +} + +static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) +{ +} + +const struct seq_operations ext4_mb_seq_structs_summary_ops = { + .start = ext4_mb_seq_structs_summary_start, + .next = ext4_mb_seq_structs_summary_next, + .stop = ext4_mb_seq_structs_summary_stop, + .show = ext4_mb_seq_structs_summary_show, +}; + +static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) +{ + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; + + BUG_ON(!cachep); + return cachep; +} + +/* + * Allocate the top-level s_group_info array for the specified number + * of groups + */ +int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned size; + struct ext4_group_info ***old_groupinfo, ***new_groupinfo; + + size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> + EXT4_DESC_PER_BLOCK_BITS(sb); + if (size <= sbi->s_group_info_size) + return 0; + + size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); + new_groupinfo = kvzalloc(size, GFP_KERNEL); + if (!new_groupinfo) { + ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); + return -ENOMEM; + } + rcu_read_lock(); + old_groupinfo = rcu_dereference(sbi->s_group_info); + if (old_groupinfo) + memcpy(new_groupinfo, old_groupinfo, + sbi->s_group_info_size * sizeof(*sbi->s_group_info)); + rcu_read_unlock(); + rcu_assign_pointer(sbi->s_group_info, new_groupinfo); + sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); + if (old_groupinfo) + ext4_kvfree_array_rcu(old_groupinfo); + ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", + sbi->s_group_info_size); + return 0; +} + +/* Create and initialize ext4_group_info data for the given group. */ +int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *desc) +{ + int i; + int metalen = 0; + int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info **meta_group_info; + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + + /* + * First check if this group is the first of a reserved block. 
+ * If it's true, we have to allocate a new table of pointers + * to ext4_group_info structures + */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { + metalen = sizeof(*meta_group_info) << + EXT4_DESC_PER_BLOCK_BITS(sb); + meta_group_info = kmalloc(metalen, GFP_NOFS); + if (meta_group_info == NULL) { + ext4_msg(sb, KERN_ERR, "can't allocate mem " + "for a buddy group"); + return -ENOMEM; + } + rcu_read_lock(); + rcu_dereference(sbi->s_group_info)[idx] = meta_group_info; + rcu_read_unlock(); + } + + meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx); + i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); + + meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS); + if (meta_group_info[i] == NULL) { + ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); + goto exit_group_info; + } + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, + &(meta_group_info[i]->bb_state)); + + /* + * initialize bb_free to be able to skip + * empty groups without initialization + */ + if (ext4_has_group_desc_csum(sb) && + (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { + meta_group_info[i]->bb_free = + ext4_free_clusters_after_init(sb, group, desc); + } else { + meta_group_info[i]->bb_free = + ext4_free_group_clusters(sb, desc); + } + + INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + init_rwsem(&meta_group_info[i]->alloc_sem); + meta_group_info[i]->bb_free_root = RB_ROOT; + meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ + meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ + meta_group_info[i]->bb_group = group; + + mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); + return 0; + +exit_group_info: + /* If a meta_group_info table has been allocated, release it now */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { + struct ext4_group_info ***group_info; + + rcu_read_lock(); + group_info = rcu_dereference(sbi->s_group_info); + kfree(group_info[idx]); + group_info[idx] = NULL; + rcu_read_unlock(); + } + return -ENOMEM; +} /* ext4_mb_add_groupinfo */ + +static int ext4_mb_init_backend(struct super_block *sb) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + ext4_group_t i; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err; + struct ext4_group_desc *desc; + struct ext4_group_info ***group_info; + struct kmem_cache *cachep; + + err = ext4_mb_alloc_groupinfo(sb, ngroups); + if (err) + return err; + + sbi->s_buddy_cache = new_inode(sb); + if (sbi->s_buddy_cache == NULL) { + ext4_msg(sb, KERN_ERR, "can't get new inode"); + goto err_freesgi; + } + /* To avoid potentially colliding with an valid on-disk inode number, + * use EXT4_BAD_INO for the buddy cache inode number. This inode is + * not in the inode hash, so it should never be found by iget(), but + * this will avoid confusion if it ever shows up during debugging. */ + sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; + EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; + for (i = 0; i < ngroups; i++) { + cond_resched(); + desc = ext4_get_group_desc(sb, i, NULL); + if (desc == NULL) { + ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); + goto err_freebuddy; + } + if (ext4_mb_add_groupinfo(sb, i, desc) != 0) + goto err_freebuddy; + } + + if (ext4_has_feature_flex_bg(sb)) { + /* a single flex group is supposed to be read by a single IO. + * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is + * unsigned integer, so the maximum shift is 32. 
+ */ + if (sbi->s_es->s_log_groups_per_flex >= 32) { + ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group"); + goto err_freebuddy; + } + sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex, + BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9)); + sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ + } else { + sbi->s_mb_prefetch = 32; + } + if (sbi->s_mb_prefetch > ext4_get_groups_count(sb)) + sbi->s_mb_prefetch = ext4_get_groups_count(sb); + /* + * now many real IOs to prefetch within a single allocation at + * CR_POWER2_ALIGNED. Given CR_POWER2_ALIGNED is an CPU-related + * optimization we shouldn't try to load too many groups, at some point + * we should start to use what we've got in memory. + * with an average random access time 5ms, it'd take a second to get + * 200 groups (* N with flex_bg), so let's make this limit 4 + */ + sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4; + if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb)) + sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb); + + return 0; + +err_freebuddy: + cachep = get_groupinfo_cache(sb->s_blocksize_bits); + while (i-- > 0) { + struct ext4_group_info *grp = ext4_get_group_info(sb, i); + + if (grp) + kmem_cache_free(cachep, grp); + } + i = sbi->s_group_info_size; + rcu_read_lock(); + group_info = rcu_dereference(sbi->s_group_info); + while (i-- > 0) + kfree(group_info[i]); + rcu_read_unlock(); + iput(sbi->s_buddy_cache); +err_freesgi: + rcu_read_lock(); + kvfree(rcu_dereference(sbi->s_group_info)); + rcu_read_unlock(); + return -ENOMEM; +} + +static void ext4_groupinfo_destroy_slabs(void) +{ + int i; + + for (i = 0; i < NR_GRPINFO_CACHES; i++) { + kmem_cache_destroy(ext4_groupinfo_caches[i]); + ext4_groupinfo_caches[i] = NULL; + } +} + +static int ext4_groupinfo_create_slab(size_t size) +{ + static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); + int slab_size; + int blocksize_bits = order_base_2(size); + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + struct kmem_cache *cachep; + + if (cache_index >= NR_GRPINFO_CACHES) + return -EINVAL; + + if (unlikely(cache_index < 0)) + cache_index = 0; + + mutex_lock(&ext4_grpinfo_slab_create_mutex); + if (ext4_groupinfo_caches[cache_index]) { + mutex_unlock(&ext4_grpinfo_slab_create_mutex); + return 0; /* Already created */ + } + + slab_size = offsetof(struct ext4_group_info, + bb_counters[blocksize_bits + 2]); + + cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], + slab_size, 0, SLAB_RECLAIM_ACCOUNT, + NULL); + + ext4_groupinfo_caches[cache_index] = cachep; + + mutex_unlock(&ext4_grpinfo_slab_create_mutex); + if (!cachep) { + printk(KERN_EMERG + "EXT4-fs: no memory for groupinfo slab cache\n"); + return -ENOMEM; + } + + return 0; +} + +static void ext4_discard_work(struct work_struct *work) +{ + struct ext4_sb_info *sbi = container_of(work, + struct ext4_sb_info, s_discard_work); + struct super_block *sb = sbi->s_sb; + struct ext4_free_data *fd, *nfd; + struct ext4_buddy e4b; + LIST_HEAD(discard_list); + ext4_group_t grp, load_grp; + int err = 0; + + spin_lock(&sbi->s_md_lock); + list_splice_init(&sbi->s_discard_list, &discard_list); + spin_unlock(&sbi->s_md_lock); + + load_grp = UINT_MAX; + list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) { + /* + * If filesystem is umounting or no memory or suffering + * from no space, give up the discard + */ + if ((sb->s_flags & SB_ACTIVE) && !err && + !atomic_read(&sbi->s_retry_alloc_pending)) { + grp = fd->efd_group; + if (grp != load_grp) { + if 
(load_grp != UINT_MAX) + ext4_mb_unload_buddy(&e4b); + + err = ext4_mb_load_buddy(sb, grp, &e4b); + if (err) { + kmem_cache_free(ext4_free_data_cachep, fd); + load_grp = UINT_MAX; + continue; + } else { + load_grp = grp; + } + } + + ext4_lock_group(sb, grp); + ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster, + fd->efd_start_cluster + fd->efd_count - 1, 1); + ext4_unlock_group(sb, grp); + } + kmem_cache_free(ext4_free_data_cachep, fd); + } + + if (load_grp != UINT_MAX) + ext4_mb_unload_buddy(&e4b); +} + +static inline void ext4_mb_avg_fragment_size_destroy(struct ext4_sb_info *sbi) +{ + if (!sbi->s_mb_avg_fragment_size) + return; + + for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++) + xa_destroy(&sbi->s_mb_avg_fragment_size[i]); + + kfree(sbi->s_mb_avg_fragment_size); + sbi->s_mb_avg_fragment_size = NULL; +} + +static inline void ext4_mb_largest_free_orders_destroy(struct ext4_sb_info *sbi) +{ + if (!sbi->s_mb_largest_free_orders) + return; + + for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++) + xa_destroy(&sbi->s_mb_largest_free_orders[i]); + + kfree(sbi->s_mb_largest_free_orders); + sbi->s_mb_largest_free_orders = NULL; +} + +int ext4_mb_init(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned i, j; + unsigned offset, offset_incr; + unsigned max; + int ret; + + i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets); + + sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); + if (sbi->s_mb_offsets == NULL) { + ret = -ENOMEM; + goto out; + } + + i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs); + sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); + if (sbi->s_mb_maxs == NULL) { + ret = -ENOMEM; + goto out; + } + + ret = ext4_groupinfo_create_slab(sb->s_blocksize); + if (ret < 0) + goto out; + + /* order 0 is regular bitmap */ + sbi->s_mb_maxs[0] = sb->s_blocksize << 3; + sbi->s_mb_offsets[0] = 0; + + i = 1; + offset = 0; + offset_incr = 1 << (sb->s_blocksize_bits - 1); + max = sb->s_blocksize << 2; + do { + sbi->s_mb_offsets[i] = offset; + sbi->s_mb_maxs[i] = max; + offset += offset_incr; + offset_incr = offset_incr >> 1; + max = max >> 1; + i++; + } while (i < MB_NUM_ORDERS(sb)); + + sbi->s_mb_avg_fragment_size = + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray), + GFP_KERNEL); + if (!sbi->s_mb_avg_fragment_size) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + xa_init(&sbi->s_mb_avg_fragment_size[i]); + + sbi->s_mb_largest_free_orders = + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray), + GFP_KERNEL); + if (!sbi->s_mb_largest_free_orders) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + xa_init(&sbi->s_mb_largest_free_orders[i]); + + spin_lock_init(&sbi->s_md_lock); + atomic_set(&sbi->s_mb_free_pending, 0); + INIT_LIST_HEAD(&sbi->s_freed_data_list[0]); + INIT_LIST_HEAD(&sbi->s_freed_data_list[1]); + INIT_LIST_HEAD(&sbi->s_discard_list); + INIT_WORK(&sbi->s_discard_work, ext4_discard_work); + atomic_set(&sbi->s_retry_alloc_pending, 0); + + sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; + sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; + sbi->s_mb_stats = MB_DEFAULT_STATS; + sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; + sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; + sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER; + + /* + * The default group preallocation is 512, which for 4k block + * sizes translates to 2 megabytes. 
However for bigalloc file + * systems, this is probably too big (i.e, if the cluster size + * is 1 megabyte, then group preallocation size becomes half a + * gigabyte!). As a default, we will keep a two megabyte + * group pralloc size for cluster sizes up to 64k, and after + * that, we will force a minimum group preallocation size of + * 32 clusters. This translates to 8 megs when the cluster + * size is 256k, and 32 megs when the cluster size is 1 meg, + * which seems reasonable as a default. + */ + sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> + sbi->s_cluster_bits, 32); + /* + * If there is a s_stripe > 1, then we set the s_mb_group_prealloc + * to the lowest multiple of s_stripe which is bigger than + * the s_mb_group_prealloc as determined above. We want + * the preallocation size to be an exact multiple of the + * RAID stripe size so that preallocations don't fragment + * the stripes. + */ + if (sbi->s_stripe > 1) { + sbi->s_mb_group_prealloc = roundup( + sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe)); + } + + sbi->s_mb_nr_global_goals = umin(num_possible_cpus(), + DIV_ROUND_UP(sbi->s_groups_count, 4)); + sbi->s_mb_last_groups = kcalloc(sbi->s_mb_nr_global_goals, + sizeof(ext4_group_t), GFP_KERNEL); + if (sbi->s_mb_last_groups == NULL) { + ret = -ENOMEM; + goto out; + } + + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); + if (sbi->s_locality_groups == NULL) { + ret = -ENOMEM; + goto out_free_last_groups; + } + for_each_possible_cpu(i) { + struct ext4_locality_group *lg; + lg = per_cpu_ptr(sbi->s_locality_groups, i); + mutex_init(&lg->lg_mutex); + for (j = 0; j < PREALLOC_TB_SIZE; j++) + INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); + spin_lock_init(&lg->lg_prealloc_lock); + } + + if (bdev_nonrot(sb->s_bdev)) + sbi->s_mb_max_linear_groups = 0; + else + sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; + /* init file for buddy data */ + ret = ext4_mb_init_backend(sb); + if (ret != 0) + goto out_free_locality_groups; + + return 0; + +out_free_locality_groups: + free_percpu(sbi->s_locality_groups); + sbi->s_locality_groups = NULL; +out_free_last_groups: + kfree(sbi->s_mb_last_groups); + sbi->s_mb_last_groups = NULL; +out: + ext4_mb_avg_fragment_size_destroy(sbi); + ext4_mb_largest_free_orders_destroy(sbi); + kfree(sbi->s_mb_offsets); + sbi->s_mb_offsets = NULL; + kfree(sbi->s_mb_maxs); + sbi->s_mb_maxs = NULL; + return ret; +} + +/* need to called with the ext4 group lock held */ +static int ext4_mb_cleanup_pa(struct ext4_group_info *grp) +{ + struct ext4_prealloc_space *pa; + struct list_head *cur, *tmp; + int count = 0; + + list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + list_del(&pa->pa_group_list); + count++; + kmem_cache_free(ext4_pspace_cachep, pa); + } + return count; +} + +void ext4_mb_release(struct super_block *sb) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + ext4_group_t i; + int num_meta_group_infos; + struct ext4_group_info *grinfo, ***group_info; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + int count; + + if (test_opt(sb, DISCARD)) { + /* + * wait the discard work to drain all of ext4_free_data + */ + flush_work(&sbi->s_discard_work); + WARN_ON_ONCE(!list_empty(&sbi->s_discard_list)); + } + + if (sbi->s_group_info) { + for (i = 0; i < ngroups; i++) { + cond_resched(); + grinfo = ext4_get_group_info(sb, i); + if (!grinfo) + continue; + mb_group_bb_bitmap_free(grinfo); + 
ext4_lock_group(sb, i); + count = ext4_mb_cleanup_pa(grinfo); + if (count) + mb_debug(sb, "mballoc: %d PAs left\n", + count); + ext4_unlock_group(sb, i); + kmem_cache_free(cachep, grinfo); + } + num_meta_group_infos = (ngroups + + EXT4_DESC_PER_BLOCK(sb) - 1) >> + EXT4_DESC_PER_BLOCK_BITS(sb); + rcu_read_lock(); + group_info = rcu_dereference(sbi->s_group_info); + for (i = 0; i < num_meta_group_infos; i++) + kfree(group_info[i]); + kvfree(group_info); + rcu_read_unlock(); + } + ext4_mb_avg_fragment_size_destroy(sbi); + ext4_mb_largest_free_orders_destroy(sbi); + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); + iput(sbi->s_buddy_cache); + if (sbi->s_mb_stats) { + ext4_msg(sb, KERN_INFO, + "mballoc: %u blocks %u reqs (%u success)", + atomic_read(&sbi->s_bal_allocated), + atomic_read(&sbi->s_bal_reqs), + atomic_read(&sbi->s_bal_success)); + ext4_msg(sb, KERN_INFO, + "mballoc: %u extents scanned, %u groups scanned, %u goal hits, " + "%u 2^N hits, %u breaks, %u lost", + atomic_read(&sbi->s_bal_ex_scanned), + atomic_read(&sbi->s_bal_groups_scanned), + atomic_read(&sbi->s_bal_goals), + atomic_read(&sbi->s_bal_2orders), + atomic_read(&sbi->s_bal_breaks), + atomic_read(&sbi->s_mb_lost_chunks)); + ext4_msg(sb, KERN_INFO, + "mballoc: %u generated and it took %llu", + atomic_read(&sbi->s_mb_buddies_generated), + atomic64_read(&sbi->s_mb_generation_time)); + ext4_msg(sb, KERN_INFO, + "mballoc: %u preallocated, %u discarded", + atomic_read(&sbi->s_mb_preallocated), + atomic_read(&sbi->s_mb_discarded)); + } + + free_percpu(sbi->s_locality_groups); + kfree(sbi->s_mb_last_groups); +} + +static inline int ext4_issue_discard(struct super_block *sb, + ext4_group_t block_group, ext4_grpblk_t cluster, int count) +{ + ext4_fsblk_t discard_block; + + discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + + ext4_group_first_block_no(sb, block_group)); + count = EXT4_C2B(EXT4_SB(sb), count); + trace_ext4_discard_blocks(sb, + (unsigned long long) discard_block, count); + + return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); +} + +static void ext4_free_data_in_buddy(struct super_block *sb, + struct ext4_free_data *entry) +{ + struct ext4_buddy e4b; + struct ext4_group_info *db; + int err, count = 0; + + mb_debug(sb, "gonna free %u blocks in group %u (0x%p):", + entry->efd_count, entry->efd_group, entry); + + err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); + /* we expect to find existing buddy because it's pinned */ + BUG_ON(err != 0); + + atomic_sub(entry->efd_count, &EXT4_SB(sb)->s_mb_free_pending); + db = e4b.bd_info; + /* there are blocks to put in buddy to make them really free */ + count += entry->efd_count; + ext4_lock_group(sb, entry->efd_group); + /* Take it out of per group rb tree */ + rb_erase(&entry->efd_node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); + + /* + * Clear the trimmed flag for the group so that the next + * ext4_trim_fs can trim it. + */ + EXT4_MB_GRP_CLEAR_TRIMMED(db); + + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() + */ + folio_put(e4b.bd_buddy_folio); + folio_put(e4b.bd_bitmap_folio); + } + ext4_unlock_group(sb, entry->efd_group); + ext4_mb_unload_buddy(&e4b); + + mb_debug(sb, "freed %d blocks in 1 structures\n", count); +} + +/* + * This function is called by the jbd2 layer once the commit has finished, + * so we know we can free the blocks that were released with that commit. 
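+ *
+ * Note the two-list double buffering: each freed extent was queued on
+ * s_freed_data_list[tid & 1] keyed by the tid of its committing
+ * transaction, so below we can splice off the list for commit_tid
+ * while the other parity list keeps accumulating extents for the
+ * transaction that is still running.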
+ */ +void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_free_data *entry, *tmp; + LIST_HEAD(freed_data_list); + struct list_head *s_freed_head = &sbi->s_freed_data_list[commit_tid & 1]; + bool wake; + + list_replace_init(s_freed_head, &freed_data_list); + + list_for_each_entry(entry, &freed_data_list, efd_list) + ext4_free_data_in_buddy(sb, entry); + + if (test_opt(sb, DISCARD)) { + spin_lock(&sbi->s_md_lock); + wake = list_empty(&sbi->s_discard_list); + list_splice_tail(&freed_data_list, &sbi->s_discard_list); + spin_unlock(&sbi->s_md_lock); + if (wake) + queue_work(system_dfl_wq, &sbi->s_discard_work); + } else { + list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) + kmem_cache_free(ext4_free_data_cachep, entry); + } +} + +int __init ext4_init_mballoc(void) +{ + ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, + SLAB_RECLAIM_ACCOUNT); + if (ext4_pspace_cachep == NULL) + goto out; + + ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, + SLAB_RECLAIM_ACCOUNT); + if (ext4_ac_cachep == NULL) + goto out_pa_free; + + ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, + SLAB_RECLAIM_ACCOUNT); + if (ext4_free_data_cachep == NULL) + goto out_ac_free; + + return 0; + +out_ac_free: + kmem_cache_destroy(ext4_ac_cachep); +out_pa_free: + kmem_cache_destroy(ext4_pspace_cachep); +out: + return -ENOMEM; +} + +void ext4_exit_mballoc(void) +{ + /* + * Wait for completion of call_rcu()'s on ext4_pspace_cachep + * before destroying the slab cache. + */ + rcu_barrier(); + kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); + kmem_cache_destroy(ext4_free_data_cachep); + ext4_groupinfo_destroy_slabs(); +} + +#define EXT4_MB_BITMAP_MARKED_CHECK 0x0001 +#define EXT4_MB_SYNC_UPDATE 0x0002 +static int +ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state, + ext4_group_t group, ext4_grpblk_t blkoff, + ext4_grpblk_t len, int flags, ext4_grpblk_t *ret_changed) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *bitmap_bh = NULL; + struct ext4_group_desc *gdp; + struct buffer_head *gdp_bh; + int err; + unsigned int i, already, changed = len; + + KUNIT_STATIC_STUB_REDIRECT(ext4_mb_mark_context, + handle, sb, state, group, blkoff, len, + flags, ret_changed); + + if (ret_changed) + *ret_changed = 0; + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) + return PTR_ERR(bitmap_bh); + + if (handle) { + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext4_journal_get_write_access(handle, sb, bitmap_bh, + EXT4_JTR_NONE); + if (err) + goto out_err; + } + + err = -EIO; + gdp = ext4_get_group_desc(sb, group, &gdp_bh); + if (!gdp) + goto out_err; + + if (handle) { + BUFFER_TRACE(gdp_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, gdp_bh, + EXT4_JTR_NONE); + if (err) + goto out_err; + } + + ext4_lock_group(sb, group); + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, group, gdp)); + } + + if (flags & EXT4_MB_BITMAP_MARKED_CHECK) { + already = 0; + for (i = 0; i < len; i++) + if (mb_test_bit(blkoff + i, bitmap_bh->b_data) == + state) + already++; + changed = len - already; + } + + if (state) { + mb_set_bits(bitmap_bh->b_data, blkoff, len); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_group_clusters(sb, gdp) - changed); + } else { + 
mb_clear_bits(bitmap_bh->b_data, blkoff, len); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_group_clusters(sb, gdp) + changed); + } + + ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); + ext4_group_desc_csum_set(sb, group, gdp); + ext4_unlock_group(sb, group); + if (ret_changed) + *ret_changed = changed; + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, group); + struct flex_groups *fg = sbi_array_rcu_deref(sbi, + s_flex_groups, flex_group); + + if (state) + atomic64_sub(changed, &fg->free_clusters); + else + atomic64_add(changed, &fg->free_clusters); + } + + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (err) + goto out_err; + err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); + if (err) + goto out_err; + + if (flags & EXT4_MB_SYNC_UPDATE) { + sync_dirty_buffer(bitmap_bh); + sync_dirty_buffer(gdp_bh); + } + +out_err: + brelse(bitmap_bh); + return err; +} + +/* + * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps + * Returns 0 if success or error code + */ +static noinline_for_stack int +ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, + handle_t *handle, unsigned int reserv_clstrs) +{ + struct ext4_group_desc *gdp; + struct ext4_sb_info *sbi; + struct super_block *sb; + ext4_fsblk_t block; + int err, len; + int flags = 0; + ext4_grpblk_t changed; + + BUG_ON(ac->ac_status != AC_STATUS_FOUND); + BUG_ON(ac->ac_b_ex.fe_len <= 0); + + sb = ac->ac_sb; + sbi = EXT4_SB(sb); + + gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, NULL); + if (!gdp) + return -EIO; + ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, + ext4_free_group_clusters(sb, gdp)); + + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + if (!ext4_inode_block_valid(ac->ac_inode, block, len)) { + ext4_error(sb, "Allocating blocks %llu-%llu which overlap " + "fs metadata", block, block+len); + /* File system mounted not to panic on error + * Fix the bitmap and return EFSCORRUPTED + * We leak some of the blocks here. + */ + err = ext4_mb_mark_context(handle, sb, true, + ac->ac_b_ex.fe_group, + ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len, + 0, NULL); + if (!err) + err = -EFSCORRUPTED; + return err; + } + +#ifdef AGGRESSIVE_CHECK + flags |= EXT4_MB_BITMAP_MARKED_CHECK; +#endif + err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group, + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len, + flags, &changed); + + if (err && changed == 0) + return err; + +#ifdef AGGRESSIVE_CHECK + BUG_ON(changed != ac->ac_b_ex.fe_len); +#endif + percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); + /* + * Now reduce the dirty block count also. Should not go negative + */ + if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + reserv_clstrs); + + return err; +} + +/* + * Idempotent helper for Ext4 fast commit replay path to set the state of + * blocks in bitmaps and update counters. + */ +void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, + int len, bool state) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t group; + ext4_grpblk_t blkoff; + int err = 0; + unsigned int clen, thisgrp_len; + + while (len > 0) { + ext4_get_group_no_and_offset(sb, block, &group, &blkoff); + + /* + * Check to see if we are freeing blocks across a group + * boundary. + * In case of flex_bg, this can happen that (block, len) may + * span across more than one group. 
In that case we need to + * get the corresponding group metadata to work with. + * For this we have goto again loop. + */ + thisgrp_len = min_t(unsigned int, (unsigned int)len, + EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff)); + clen = EXT4_NUM_B2C(sbi, thisgrp_len); + + if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) { + ext4_error(sb, "Marking blocks in system zone - " + "Block = %llu, len = %u", + block, thisgrp_len); + break; + } + + err = ext4_mb_mark_context(NULL, sb, state, + group, blkoff, clen, + EXT4_MB_BITMAP_MARKED_CHECK | + EXT4_MB_SYNC_UPDATE, + NULL); + if (err) + break; + + block += thisgrp_len; + len -= thisgrp_len; + BUG_ON(len < 0); + } +} + +/* + * here we normalize request for locality group + * Group request are normalized to s_mb_group_prealloc, which goes to + * s_strip if we set the same via mount option. + * s_mb_group_prealloc can be configured via + * /sys/fs/ext4/<partition>/mb_group_prealloc + * + * XXX: should we try to preallocate more than the group has now? + */ +static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg = ac->ac_lg; + + BUG_ON(lg == NULL); + ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; + mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len); +} + +/* + * This function returns the next element to look at during inode + * PA rbtree walk. We assume that we have held the inode PA rbtree lock + * (ei->i_prealloc_lock) + * + * new_start The start of the range we want to compare + * cur_start The existing start that we are comparing against + * node The node of the rb_tree + */ +static inline struct rb_node* +ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node) +{ + if (new_start < cur_start) + return node->rb_left; + else + return node->rb_right; +} + +static inline void +ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, + ext4_lblk_t start, loff_t end) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_prealloc_space *tmp_pa; + ext4_lblk_t tmp_pa_start; + loff_t tmp_pa_end; + struct rb_node *iter; + + read_lock(&ei->i_prealloc_lock); + for (iter = ei->i_prealloc_node.rb_node; iter; + iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) { + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + tmp_pa_start = tmp_pa->pa_lstart; + tmp_pa_end = pa_logical_end(sbi, tmp_pa); + + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0) + BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start)); + spin_unlock(&tmp_pa->pa_lock); + } + read_unlock(&ei->i_prealloc_lock); +} + +/* + * Given an allocation context "ac" and a range "start", "end", check + * and adjust boundaries if the range overlaps with any of the existing + * preallocatoins stored in the corresponding inode of the allocation context. 
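+ *
+ * Net effect, as a sketch: once the nearest live PAs on either side
+ * of the original request are found, the window is only ever shrunk:
+ *
+ *	new_start = max(new_start, left_pa_end);
+ *	new_end   = min(new_end, right_pa_start);
+ *
+ * so the normalized range can never grow into an existing PA.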
+ * + * Parameters: + * ac allocation context + * start start of the new range + * end end of the new range + */ +static inline void +ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, + ext4_lblk_t *start, loff_t *end) +{ + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL; + struct rb_node *iter; + ext4_lblk_t new_start, tmp_pa_start, right_pa_start = -1; + loff_t new_end, tmp_pa_end, left_pa_end = -1; + + new_start = *start; + new_end = *end; + + /* + * Adjust the normalized range so that it doesn't overlap with any + * existing preallocated blocks(PAs). Make sure to hold the rbtree lock + * so it doesn't change underneath us. + */ + read_lock(&ei->i_prealloc_lock); + + /* Step 1: find any one immediate neighboring PA of the normalized range */ + for (iter = ei->i_prealloc_node.rb_node; iter; + iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, + tmp_pa_start, iter)) { + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + tmp_pa_start = tmp_pa->pa_lstart; + tmp_pa_end = pa_logical_end(sbi, tmp_pa); + + /* PA must not overlap original request */ + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0) + BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end || + ac->ac_o_ex.fe_logical < tmp_pa_start)); + spin_unlock(&tmp_pa->pa_lock); + } + + /* + * Step 2: check if the found PA is left or right neighbor and + * get the other neighbor + */ + if (tmp_pa) { + if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) { + struct rb_node *tmp; + + left_pa = tmp_pa; + tmp = rb_next(&left_pa->pa_node.inode_node); + if (tmp) { + right_pa = rb_entry(tmp, + struct ext4_prealloc_space, + pa_node.inode_node); + } + } else { + struct rb_node *tmp; + + right_pa = tmp_pa; + tmp = rb_prev(&right_pa->pa_node.inode_node); + if (tmp) { + left_pa = rb_entry(tmp, + struct ext4_prealloc_space, + pa_node.inode_node); + } + } + } + + /* Step 3: get the non deleted neighbors */ + if (left_pa) { + for (iter = &left_pa->pa_node.inode_node;; + iter = rb_prev(iter)) { + if (!iter) { + left_pa = NULL; + break; + } + + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + left_pa = tmp_pa; + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0) { + spin_unlock(&tmp_pa->pa_lock); + break; + } + spin_unlock(&tmp_pa->pa_lock); + } + } + + if (right_pa) { + for (iter = &right_pa->pa_node.inode_node;; + iter = rb_next(iter)) { + if (!iter) { + right_pa = NULL; + break; + } + + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + right_pa = tmp_pa; + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0) { + spin_unlock(&tmp_pa->pa_lock); + break; + } + spin_unlock(&tmp_pa->pa_lock); + } + } + + if (left_pa) { + left_pa_end = pa_logical_end(sbi, left_pa); + BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical); + } + + if (right_pa) { + right_pa_start = right_pa->pa_lstart; + BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical); + } + + /* Step 4: trim our normalized range to not overlap with the neighbors */ + if (left_pa) { + if (left_pa_end > new_start) + new_start = left_pa_end; + } + + if (right_pa) { + if (right_pa_start < new_end) + new_end = right_pa_start; + } + read_unlock(&ei->i_prealloc_lock); + + /* XXX: extra loop to check we really don't overlap preallocations */ + ext4_mb_pa_assert_overlap(ac, new_start, new_end); + + *start = new_start; + *end = new_end; +} + +/* + * Normalization means making request better in 
terms of + * size and alignment + */ +static noinline_for_stack void +ext4_mb_normalize_request(struct ext4_allocation_context *ac, + struct ext4_allocation_request *ar) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_super_block *es = sbi->s_es; + int bsbits, max; + loff_t size, start_off, end; + loff_t orig_size __maybe_unused; + ext4_lblk_t start; + + /* do normalize only data requests, metadata requests + do not need preallocation */ + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return; + + /* sometime caller may want exact blocks */ + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + + /* caller may indicate that preallocation isn't + * required (it's a tail, for example) */ + if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) + return; + + if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { + ext4_mb_normalize_group_request(ac); + return ; + } + + bsbits = ac->ac_sb->s_blocksize_bits; + + /* first, let's learn actual file size + * given current request is allocated */ + size = extent_logical_end(sbi, &ac->ac_o_ex); + size = size << bsbits; + if (size < i_size_read(ac->ac_inode)) + size = i_size_read(ac->ac_inode); + orig_size = size; + + /* max size of free chunks */ + max = 2 << bsbits; + +#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ + (req <= (size) || max <= (chunk_size)) + + /* first, try to predict filesize */ + /* XXX: should this table be tunable? */ + start_off = 0; + if (size <= 16 * 1024) { + size = 16 * 1024; + } else if (size <= 32 * 1024) { + size = 32 * 1024; + } else if (size <= 64 * 1024) { + size = 64 * 1024; + } else if (size <= 128 * 1024) { + size = 128 * 1024; + } else if (size <= 256 * 1024) { + size = 256 * 1024; + } else if (size <= 512 * 1024) { + size = 512 * 1024; + } else if (size <= 1024 * 1024) { + size = 1024 * 1024; + } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { + start_off = ((loff_t)ac->ac_o_ex.fe_logical >> + (21 - bsbits)) << 21; + size = 2 * 1024 * 1024; + } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { + start_off = ((loff_t)ac->ac_o_ex.fe_logical >> + (22 - bsbits)) << 22; + size = 4 * 1024 * 1024; + } else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len), + (8<<20)>>bsbits, max, 8 * 1024)) { + start_off = ((loff_t)ac->ac_o_ex.fe_logical >> + (23 - bsbits)) << 23; + size = 8 * 1024 * 1024; + } else { + start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; + size = (loff_t) EXT4_C2B(sbi, + ac->ac_o_ex.fe_len) << bsbits; + } + size = size >> bsbits; + start = start_off >> bsbits; + + /* + * For tiny groups (smaller than 8MB) the chosen allocation + * alignment may be larger than group size. Make sure the + * alignment does not move allocation to a different group which + * makes mballoc fail assertions later. + */ + start = max(start, rounddown(ac->ac_o_ex.fe_logical, + (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb))); + + /* avoid unnecessary preallocation that may trigger assertions */ + if (start + size > EXT_MAX_BLOCKS) + size = EXT_MAX_BLOCKS - start; + + /* don't cover already allocated blocks in selected range */ + if (ar->pleft && start <= ar->lleft) { + size -= ar->lleft + 1 - start; + start = ar->lleft + 1; + } + if (ar->pright && start + size - 1 >= ar->lright) + size -= start + size - ar->lright; + + /* + * Trim allocation request for filesystems with artificially small + * groups. 
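+	 * (For example, with illustrative numbers: with 1k blocks a group
+	 * typically spans 8192 blocks, so a normalized request of 16384
+	 * blocks would be clamped to 8192 below to keep it inside one
+	 * group.)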
+	 */
+	if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
+		size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);
+
+	end = start + size;
+
+	ext4_mb_pa_adjust_overlap(ac, &start, &end);
+
+	size = end - start;
+
+	/*
+	 * In this function "start" and "size" are normalized for better
+	 * alignment and length such that we could preallocate more blocks.
+	 * This normalization is done such that the original request of
+	 * ac->ac_o_ex.fe_logical & fe_len should always lie within the "start"
+	 * and "size" boundaries.
+	 * (Note that fe_len can be relaxed since the FS block allocation API
+	 * does not guarantee a given number of contiguous blocks, as that
+	 * depends upon the free space left, etc.)
+	 * In case of an inode pa, later we use the allocated blocks
+	 * [pa_pstart + fe_logical - pa_lstart, fe_len/size] from the preallocated
+	 * range of goal/best blocks [start, size] to put them at the
+	 * ac_o_ex.fe_logical extent of this inode.
+	 * (See ext4_mb_use_inode_pa() for more details.)
+	 */
+	if (start + size <= ac->ac_o_ex.fe_logical ||
+	    start > ac->ac_o_ex.fe_logical) {
+		ext4_msg(ac->ac_sb, KERN_ERR,
+			 "start %lu, size %lu, fe_logical %lu",
+			 (unsigned long) start, (unsigned long) size,
+			 (unsigned long) ac->ac_o_ex.fe_logical);
+		BUG();
+	}
+	BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+
+	/* now prepare goal request */
+
+	/* XXX: is it better to align blocks w.r.t. logical
+	 * placement or satisfy the big request as is */
+	ac->ac_g_ex.fe_logical = start;
+	ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
+	ac->ac_orig_goal_len = ac->ac_g_ex.fe_len;
+
+	/* define goal start in order to merge */
+	if (ar->pright && (ar->lright == (start + size)) &&
+	    ar->pright >= size &&
+	    ar->pright - size >= le32_to_cpu(es->s_first_data_block)) {
+		/* merge to the right */
+		ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
+						&ac->ac_g_ex.fe_group,
+						&ac->ac_g_ex.fe_start);
+		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
+	}
+	if (ar->pleft && (ar->lleft + 1 == start) &&
+	    ar->pleft + 1 < ext4_blocks_count(es)) {
+		/* merge to the left */
+		ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
+						&ac->ac_g_ex.fe_group,
+						&ac->ac_g_ex.fe_start);
+		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
+	}
+
+	mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size,
+		 orig_size, start);
+}
+
+static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+
+	if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
+		atomic_inc(&sbi->s_bal_reqs);
+		atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
+		if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
+			atomic_inc(&sbi->s_bal_success);
+
+		atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
+		for (int i = 0; i < EXT4_MB_NUM_CRS; i++) {
+			atomic_add(ac->ac_cX_found[i], &sbi->s_bal_cX_ex_scanned[i]);
+		}
+
+		atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
+		if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
+				ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
+			atomic_inc(&sbi->s_bal_goals);
+		/* did we allocate as much as the normalizer originally wanted? */
+		if (ac->ac_f_ex.fe_len == ac->ac_orig_goal_len)
+			atomic_inc(&sbi->s_bal_len_goals);
+
+		if (ac->ac_found > sbi->s_mb_max_to_scan)
+			atomic_inc(&sbi->s_bal_breaks);
+	}
+
+	if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
+		trace_ext4_mballoc_alloc(ac);
+	else
+		trace_ext4_mballoc_prealloc(ac);
+}
+
+/*
+ * Called on failure; free up any blocks from the inode PA for this
+ * context.
We don't need this for MB_GROUP_PA because we only change + * pa_free in ext4_mb_release_context(), but on failure, we've already + * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. + */ +static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa = ac->ac_pa; + struct ext4_buddy e4b; + int err; + + if (pa == NULL) { + if (ac->ac_f_ex.fe_len == 0) + return; + err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); + if (WARN_RATELIMIT(err, + "ext4: mb_load_buddy failed (%d)", err)) + /* + * This should never happen since we pin the + * pages in the ext4_allocation_context so + * ext4_mb_load_buddy() should never fail. + */ + return; + ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group); + mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, + ac->ac_f_ex.fe_len); + ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); + ext4_mb_unload_buddy(&e4b); + return; + } + if (pa->pa_type == MB_INODE_PA) { + spin_lock(&pa->pa_lock); + pa->pa_free += ac->ac_b_ex.fe_len; + spin_unlock(&pa->pa_lock); + } +} + +/* + * use blocks preallocated to inode + */ +static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + ext4_fsblk_t start; + ext4_fsblk_t end; + int len; + + /* found preallocated blocks, use them */ + start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); + end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), + start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); + len = EXT4_NUM_B2C(sbi, end - start); + ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, + &ac->ac_b_ex.fe_start); + ac->ac_b_ex.fe_len = len; + ac->ac_status = AC_STATUS_FOUND; + ac->ac_pa = pa; + + BUG_ON(start < pa->pa_pstart); + BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); + BUG_ON(pa->pa_free < len); + BUG_ON(ac->ac_b_ex.fe_len <= 0); + pa->pa_free -= len; + + mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa); +} + +/* + * use blocks preallocated to locality group + */ +static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa) +{ + unsigned int len = ac->ac_o_ex.fe_len; + + ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, + &ac->ac_b_ex.fe_group, + &ac->ac_b_ex.fe_start); + ac->ac_b_ex.fe_len = len; + ac->ac_status = AC_STATUS_FOUND; + ac->ac_pa = pa; + + /* we don't correct pa_pstart or pa_len here to avoid + * possible race when the group is being loaded concurrently + * instead we correct pa later, after blocks are marked + * in on-disk bitmap -- see ext4_mb_release_context() + * Other CPUs are prevented from allocating from this pa by lg_mutex + */ + mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n", + pa->pa_lstart, len, pa); +} + +/* + * Return the prealloc space that have minimal distance + * from the goal block. @cpa is the prealloc + * space that is having currently known minimal distance + * from the goal block. 
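+ * (For example, with illustrative numbers: with goal_block 1000,
+ * cpa->pa_pstart 900 and pa->pa_pstart 1080, the distances are 100 and
+ * 80 respectively, so @pa replaces @cpa as the closest known space.)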
+ */
+static struct ext4_prealloc_space *
+ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
+			struct ext4_prealloc_space *pa,
+			struct ext4_prealloc_space *cpa)
+{
+	ext4_fsblk_t cur_distance, new_distance;
+
+	if (cpa == NULL) {
+		atomic_inc(&pa->pa_count);
+		return pa;
+	}
+	cur_distance = abs(goal_block - cpa->pa_pstart);
+	new_distance = abs(goal_block - pa->pa_pstart);
+
+	if (cur_distance <= new_distance)
+		return cpa;
+
+	/* drop the previous reference */
+	atomic_dec(&cpa->pa_count);
+	atomic_inc(&pa->pa_count);
+	return pa;
+}
+
+/*
+ * check if the found pa meets EXT4_MB_HINT_GOAL_ONLY
+ */
+static bool
+ext4_mb_pa_goal_check(struct ext4_allocation_context *ac,
+		      struct ext4_prealloc_space *pa)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	ext4_fsblk_t start;
+
+	if (likely(!(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)))
+		return true;
+
+	/*
+	 * If EXT4_MB_HINT_GOAL_ONLY is set, ac_g_ex will not be adjusted
+	 * in ext4_mb_normalize_request and will stay the same as ac_o_ex
+	 * from ext4_mb_initialize_context. Choose ac_g_ex here to stay
+	 * consistent with ext4_mb_find_by_goal.
+	 */
+	start = pa->pa_pstart +
+		(ac->ac_g_ex.fe_logical - pa->pa_lstart);
+	if (ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex) != start)
+		return false;
+
+	if (ac->ac_g_ex.fe_len > pa->pa_len -
+	    EXT4_B2C(sbi, ac->ac_g_ex.fe_logical - pa->pa_lstart))
+		return false;
+
+	return true;
+}
+
+/*
+ * search goal blocks in preallocated space
+ */
+static noinline_for_stack bool
+ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	int order, i;
+	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
+	struct ext4_locality_group *lg;
+	struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL;
+	struct rb_node *iter;
+	ext4_fsblk_t goal_block;
+
+	/* only data can be preallocated */
+	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+		return false;
+
+	/*
+	 * first, try per-file preallocation by searching the inode pa rbtree.
+	 *
+	 * Here, we can't do a direct traversal of the tree because
+	 * ext4_mb_discard_group_preallocation() can mark the pa deleted in
+	 * parallel, and that can cause the direct traversal to skip some
+	 * entries.
+	 */
+	read_lock(&ei->i_prealloc_lock);
+
+	if (RB_EMPTY_ROOT(&ei->i_prealloc_node)) {
+		goto try_group_pa;
+	}
+
+	/*
+	 * Step 1: Find a pa with logical start immediately adjacent to the
+	 * original logical start. This could be on the left or right.
+	 *
+	 * (tmp_pa->pa_lstart never changes so we can skip locking for it.)
+	 */
+	for (iter = ei->i_prealloc_node.rb_node; iter;
+	     iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
+					    tmp_pa->pa_lstart, iter)) {
+		tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+				  pa_node.inode_node);
+	}
+
+	/*
+	 * Step 2: The adjacent pa might be to the right of the logical start;
+	 * find the left adjacent pa. After this step we'd have a valid tmp_pa
+	 * whose logical start is to the left of the original request's
+	 * logical start.
+	 */
+	if (tmp_pa->pa_lstart > ac->ac_o_ex.fe_logical) {
+		struct rb_node *tmp;
+		tmp = rb_prev(&tmp_pa->pa_node.inode_node);
+
+		if (tmp) {
+			tmp_pa = rb_entry(tmp, struct ext4_prealloc_space,
+					  pa_node.inode_node);
+		} else {
+			/*
+			 * If there is no adjacent pa to the left then finding
+			 * an overlapping pa is not possible, hence stop
+			 * searching the inode pa tree
+			 */
+			goto try_group_pa;
+		}
+	}
+
+	BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));
+
+	/*
+	 * Step 3: If the left adjacent pa is deleted, keep moving left to find
+	 * the first non deleted adjacent pa.
After this step we should have a + * valid tmp_pa which is guaranteed to be non deleted. + */ + for (iter = &tmp_pa->pa_node.inode_node;; iter = rb_prev(iter)) { + if (!iter) { + /* + * no non deleted left adjacent pa, so stop searching + * inode pa tree + */ + goto try_group_pa; + } + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0) { + /* + * We will keep holding the pa_lock from + * this point on because we don't want group discard + * to delete this pa underneath us. Since group + * discard is anyways an ENOSPC operation it + * should be okay for it to wait a few more cycles. + */ + break; + } else { + spin_unlock(&tmp_pa->pa_lock); + } + } + + BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical)); + BUG_ON(tmp_pa->pa_deleted == 1); + + /* + * Step 4: We now have the non deleted left adjacent pa. Only this + * pa can possibly satisfy the request hence check if it overlaps + * original logical start and stop searching if it doesn't. + */ + if (ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, tmp_pa)) { + spin_unlock(&tmp_pa->pa_lock); + goto try_group_pa; + } + + /* non-extent files can't have physical blocks past 2^32 */ + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && + (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) > + EXT4_MAX_BLOCK_FILE_PHYS)) { + /* + * Since PAs don't overlap, we won't find any other PA to + * satisfy this. + */ + spin_unlock(&tmp_pa->pa_lock); + goto try_group_pa; + } + + if (tmp_pa->pa_free && likely(ext4_mb_pa_goal_check(ac, tmp_pa))) { + atomic_inc(&tmp_pa->pa_count); + ext4_mb_use_inode_pa(ac, tmp_pa); + spin_unlock(&tmp_pa->pa_lock); + read_unlock(&ei->i_prealloc_lock); + return true; + } else { + /* + * We found a valid overlapping pa but couldn't use it because + * it had no free blocks. This should ideally never happen + * because: + * + * 1. When a new inode pa is added to rbtree it must have + * pa_free > 0 since otherwise we won't actually need + * preallocation. + * + * 2. An inode pa that is in the rbtree can only have it's + * pa_free become zero when another thread calls: + * ext4_mb_new_blocks + * ext4_mb_use_preallocated + * ext4_mb_use_inode_pa + * + * 3. Further, after the above calls make pa_free == 0, we will + * immediately remove it from the rbtree in: + * ext4_mb_new_blocks + * ext4_mb_release_context + * ext4_mb_put_pa + * + * 4. Since the pa_free becoming 0 and pa_free getting removed + * from tree both happen in ext4_mb_new_blocks, which is always + * called with i_data_sem held for data allocations, we can be + * sure that another process will never see a pa in rbtree with + * pa_free == 0. + */ + WARN_ON_ONCE(tmp_pa->pa_free == 0); + } + spin_unlock(&tmp_pa->pa_lock); +try_group_pa: + read_unlock(&ei->i_prealloc_lock); + + /* can we use group allocation? */ + if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) + return false; + + /* inode may have no locality group for some reason */ + lg = ac->ac_lg; + if (lg == NULL) + return false; + order = fls(ac->ac_o_ex.fe_len) - 1; + if (order > PREALLOC_TB_SIZE - 1) + /* The max size of hash table is PREALLOC_TB_SIZE */ + order = PREALLOC_TB_SIZE - 1; + + goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); + /* + * search for the prealloc space that is having + * minimal distance from the goal block. 
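+	 * (For example, with illustrative numbers: a request of 13 clusters
+	 * gives order fls(13) - 1 == 3, so only lists of order 3 and above
+	 * are scanned; a pa with pa_free >= 13 cannot sit in a lower-order
+	 * list.)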
+ */ + for (i = order; i < PREALLOC_TB_SIZE; i++) { + rcu_read_lock(); + list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[i], + pa_node.lg_list) { + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0 && + tmp_pa->pa_free >= ac->ac_o_ex.fe_len) { + + cpa = ext4_mb_check_group_pa(goal_block, + tmp_pa, cpa); + } + spin_unlock(&tmp_pa->pa_lock); + } + rcu_read_unlock(); + } + if (cpa) { + ext4_mb_use_group_pa(ac, cpa); + return true; + } + return false; +} + +/* + * the function goes through all preallocation in this group and marks them + * used in in-core bitmap. buddy must be generated from this bitmap + * Need to be called with ext4 group lock held + */ +static noinline_for_stack +void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_prealloc_space *pa; + struct list_head *cur; + ext4_group_t groupnr; + ext4_grpblk_t start; + int preallocated = 0; + int len; + + if (!grp) + return; + + /* all form of preallocation discards first load group, + * so the only competing code is preallocation use. + * we don't need any locking here + * notice we do NOT ignore preallocations with pa_deleted + * otherwise we could leave used blocks available for + * allocation in buddy when concurrent ext4_mb_put_pa() + * is dropping preallocation + */ + list_for_each(cur, &grp->bb_prealloc_list) { + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + spin_lock(&pa->pa_lock); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, + &groupnr, &start); + len = pa->pa_len; + spin_unlock(&pa->pa_lock); + if (unlikely(len == 0)) + continue; + BUG_ON(groupnr != group); + mb_set_bits(bitmap, start, len); + preallocated += len; + } + mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); +} + +static void ext4_mb_mark_pa_deleted(struct super_block *sb, + struct ext4_prealloc_space *pa) +{ + struct ext4_inode_info *ei; + + if (pa->pa_deleted) { + ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n", + pa->pa_type, pa->pa_pstart, pa->pa_lstart, + pa->pa_len); + return; + } + + pa->pa_deleted = 1; + + if (pa->pa_type == MB_INODE_PA) { + ei = EXT4_I(pa->pa_inode); + atomic_dec(&ei->i_prealloc_active); + } +} + +static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa) +{ + BUG_ON(!pa); + BUG_ON(atomic_read(&pa->pa_count)); + BUG_ON(pa->pa_deleted == 0); + kmem_cache_free(ext4_pspace_cachep, pa); +} + +static void ext4_mb_pa_callback(struct rcu_head *head) +{ + struct ext4_prealloc_space *pa; + + pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); + ext4_mb_pa_free(pa); +} + +/* + * drops a reference to preallocated space descriptor + * if this was the last reference and the space is consumed + */ +static void ext4_mb_put_pa(struct ext4_allocation_context *ac, + struct super_block *sb, struct ext4_prealloc_space *pa) +{ + ext4_group_t grp; + ext4_fsblk_t grp_blk; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + + /* in this short window concurrent discard can set pa_deleted */ + spin_lock(&pa->pa_lock); + if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { + spin_unlock(&pa->pa_lock); + return; + } + + if (pa->pa_deleted == 1) { + spin_unlock(&pa->pa_lock); + return; + } + + ext4_mb_mark_pa_deleted(sb, pa); + spin_unlock(&pa->pa_lock); + + grp_blk = pa->pa_pstart; + /* + * If doing group-based preallocation, pa_pstart may be in the + * next group when pa is used up + */ + if (pa->pa_type == MB_GROUP_PA) + grp_blk--; + + 
grp = ext4_get_group_number(sb, grp_blk); + + /* + * possible race: + * + * P1 (buddy init) P2 (regular allocation) + * find block B in PA + * copy on-disk bitmap to buddy + * mark B in on-disk bitmap + * drop PA from group + * mark all PAs in buddy + * + * thus, P1 initializes buddy with B available. to prevent this + * we make "copy" and "mark all PAs" atomic and serialize "drop PA" + * against that pair + */ + ext4_lock_group(sb, grp); + list_del(&pa->pa_group_list); + ext4_unlock_group(sb, grp); + + if (pa->pa_type == MB_INODE_PA) { + write_lock(pa->pa_node_lock.inode_lock); + rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); + write_unlock(pa->pa_node_lock.inode_lock); + ext4_mb_pa_free(pa); + } else { + spin_lock(pa->pa_node_lock.lg_lock); + list_del_rcu(&pa->pa_node.lg_list); + spin_unlock(pa->pa_node_lock.lg_lock); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } +} + +static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new) +{ + struct rb_node **iter = &root->rb_node, *parent = NULL; + struct ext4_prealloc_space *iter_pa, *new_pa; + ext4_lblk_t iter_start, new_start; + + while (*iter) { + iter_pa = rb_entry(*iter, struct ext4_prealloc_space, + pa_node.inode_node); + new_pa = rb_entry(new, struct ext4_prealloc_space, + pa_node.inode_node); + iter_start = iter_pa->pa_lstart; + new_start = new_pa->pa_lstart; + + parent = *iter; + if (new_start < iter_start) + iter = &((*iter)->rb_left); + else + iter = &((*iter)->rb_right); + } + + rb_link_node(new, parent, iter); + rb_insert_color(new, root); +} + +/* + * creates new preallocated space for given inode + */ +static noinline_for_stack void +ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_prealloc_space *pa; + struct ext4_group_info *grp; + struct ext4_inode_info *ei; + + /* preallocate only when found space is larger then requested */ + BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); + BUG_ON(ac->ac_status != AC_STATUS_FOUND); + BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); + BUG_ON(ac->ac_pa == NULL); + + pa = ac->ac_pa; + + if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) { + struct ext4_free_extent ex = { + .fe_logical = ac->ac_g_ex.fe_logical, + .fe_len = ac->ac_orig_goal_len, + }; + loff_t orig_goal_end = extent_logical_end(sbi, &ex); + loff_t o_ex_end = extent_logical_end(sbi, &ac->ac_o_ex); + + /* + * We can't allocate as much as normalizer wants, so we try + * to get proper lstart to cover the original request, except + * when the goal doesn't cover the original request as below: + * + * orig_ex:2045/2055(10), isize:8417280 -> normalized:0/2048 + * best_ex:0/200(200) -> adjusted: 1848/2048(200) + */ + BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); + BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); + + /* + * Use the below logic for adjusting best extent as it keeps + * fragmentation in check while ensuring logical range of best + * extent doesn't overflow out of goal extent: + * + * 1. Check if best ex can be kept at end of goal (before + * cr_best_avail trimmed it) and still cover original start + * 2. Else, check if best ex can be kept at start of goal and + * still cover original end + * 3. Else, keep the best ex at start of original request. 
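+		 * (For example, with illustrative numbers: goal 0/512,
+		 * original 100/10, best len 64. Check 1 would start the
+		 * best extent at 448, past the original start 100; check 2
+		 * keeps it at 0, but 0..63 misses the original end 110; so
+		 * check 3 places it at the original start 100.)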
+ */ + ex.fe_len = ac->ac_b_ex.fe_len; + + ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len); + if (ac->ac_o_ex.fe_logical >= ex.fe_logical) + goto adjust_bex; + + ex.fe_logical = ac->ac_g_ex.fe_logical; + if (o_ex_end <= extent_logical_end(sbi, &ex)) + goto adjust_bex; + + ex.fe_logical = ac->ac_o_ex.fe_logical; +adjust_bex: + ac->ac_b_ex.fe_logical = ex.fe_logical; + + BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); + BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end); + } + + pa->pa_lstart = ac->ac_b_ex.fe_logical; + pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + pa->pa_len = ac->ac_b_ex.fe_len; + pa->pa_free = pa->pa_len; + spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_group_list); + pa->pa_deleted = 0; + pa->pa_type = MB_INODE_PA; + + mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, + pa->pa_len, pa->pa_lstart); + trace_ext4_mb_new_inode_pa(ac, pa); + + atomic_add(pa->pa_free, &sbi->s_mb_preallocated); + ext4_mb_use_inode_pa(ac, pa); + + ei = EXT4_I(ac->ac_inode); + grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); + if (!grp) + return; + + pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock; + pa->pa_inode = ac->ac_inode; + + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); + + write_lock(pa->pa_node_lock.inode_lock); + ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node); + write_unlock(pa->pa_node_lock.inode_lock); + atomic_inc(&ei->i_prealloc_active); +} + +/* + * creates new preallocated space for locality group inodes belongs to + */ +static noinline_for_stack void +ext4_mb_new_group_pa(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg; + struct ext4_prealloc_space *pa; + struct ext4_group_info *grp; + + /* preallocate only when found space is larger then requested */ + BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); + BUG_ON(ac->ac_status != AC_STATUS_FOUND); + BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); + BUG_ON(ac->ac_pa == NULL); + + pa = ac->ac_pa; + + pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + pa->pa_lstart = pa->pa_pstart; + pa->pa_len = ac->ac_b_ex.fe_len; + pa->pa_free = pa->pa_len; + spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_node.lg_list); + INIT_LIST_HEAD(&pa->pa_group_list); + pa->pa_deleted = 0; + pa->pa_type = MB_GROUP_PA; + + mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, + pa->pa_len, pa->pa_lstart); + trace_ext4_mb_new_group_pa(ac, pa); + + ext4_mb_use_group_pa(ac, pa); + atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); + + grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); + if (!grp) + return; + lg = ac->ac_lg; + BUG_ON(lg == NULL); + + pa->pa_node_lock.lg_lock = &lg->lg_prealloc_lock; + pa->pa_inode = NULL; + + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); + + /* + * We will later add the new pa to the right bucket + * after updating the pa_free in ext4_mb_release_context + */ +} + +static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac) +{ + if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) + ext4_mb_new_group_pa(ac); + else + ext4_mb_new_inode_pa(ac); +} + +/* + * finds all unused blocks in on-disk bitmap, frees them in + * in-core bitmap and buddy. + * @pa must be unlinked from inode and group lists, so that + * nobody else can find/use it. + * the caller MUST hold group/inode locks. 
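+ * (For example, with illustrative numbers: if 10 blocks of a 16-block
+ * PA were actually written, the other 6 appear as zero bits in the
+ * on-disk bitmap and are freed here; the total freed should equal
+ * pa_free.)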
+ * TODO: optimize the case when there are no in-core structures yet + */ +static noinline_for_stack void +ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, + struct ext4_prealloc_space *pa) +{ + struct super_block *sb = e4b->bd_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned int end; + unsigned int next; + ext4_group_t group; + ext4_grpblk_t bit; + unsigned long long grp_blk_start; + int free = 0; + + BUG_ON(pa->pa_deleted == 0); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); + grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); + BUG_ON(group != e4b->bd_group && pa->pa_len != 0); + end = bit + pa->pa_len; + + while (bit < end) { + bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); + if (bit >= end) + break; + next = mb_find_next_bit(bitmap_bh->b_data, end, bit); + mb_debug(sb, "free preallocated %u/%u in group %u\n", + (unsigned) ext4_group_first_block_no(sb, group) + bit, + (unsigned) next - bit, (unsigned) group); + free += next - bit; + + trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); + trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + + EXT4_C2B(sbi, bit)), + next - bit); + mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); + bit = next + 1; + } + if (free != pa->pa_free) { + ext4_msg(e4b->bd_sb, KERN_CRIT, + "pa %p: logic %lu, phys. %lu, len %d", + pa, (unsigned long) pa->pa_lstart, + (unsigned long) pa->pa_pstart, + pa->pa_len); + ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", + free, pa->pa_free); + /* + * pa is already deleted so we use the value obtained + * from the bitmap and continue. + */ + } + atomic_add(free, &sbi->s_mb_discarded); +} + +static noinline_for_stack void +ext4_mb_release_group_pa(struct ext4_buddy *e4b, + struct ext4_prealloc_space *pa) +{ + struct super_block *sb = e4b->bd_sb; + ext4_group_t group; + ext4_grpblk_t bit; + + trace_ext4_mb_release_group_pa(sb, pa); + BUG_ON(pa->pa_deleted == 0); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); + if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) { + ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu", + e4b->bd_group, group, pa->pa_pstart); + return; + } + mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); + atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); + trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); +} + +/* + * releases all preallocations in given group + * + * first, we need to decide discard policy: + * - when do we discard + * 1) ENOSPC + * - how many do we discard + * 1) how many requested + */ +static noinline_for_stack int +ext4_mb_discard_group_preallocations(struct super_block *sb, + ext4_group_t group, int *busy) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct buffer_head *bitmap_bh = NULL; + struct ext4_prealloc_space *pa, *tmp; + LIST_HEAD(list); + struct ext4_buddy e4b; + struct ext4_inode_info *ei; + int err; + int free = 0; + + if (!grp) + return 0; + mb_debug(sb, "discard preallocation for group %u\n", group); + if (list_empty(&grp->bb_prealloc_list)) + goto out_dbg; + + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); + ext4_error_err(sb, -err, + "Error %d reading block bitmap for %u", + err, group); + goto out_dbg; + } + + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { + ext4_warning(sb, "Error %d loading buddy information for %u", + err, group); + put_bh(bitmap_bh); + goto out_dbg; + } + + ext4_lock_group(sb, group); + list_for_each_entry_safe(pa, 
tmp, + &grp->bb_prealloc_list, pa_group_list) { + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + spin_unlock(&pa->pa_lock); + *busy = 1; + continue; + } + if (pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + + /* seems this one can be freed ... */ + ext4_mb_mark_pa_deleted(sb, pa); + + if (!free) + this_cpu_inc(discard_pa_seq); + + /* we can trust pa_free ... */ + free += pa->pa_free; + + spin_unlock(&pa->pa_lock); + + list_del(&pa->pa_group_list); + list_add(&pa->u.pa_tmp_list, &list); + } + + /* now free all selected PAs */ + list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { + + /* remove from object (inode or locality group) */ + if (pa->pa_type == MB_GROUP_PA) { + spin_lock(pa->pa_node_lock.lg_lock); + list_del_rcu(&pa->pa_node.lg_list); + spin_unlock(pa->pa_node_lock.lg_lock); + } else { + write_lock(pa->pa_node_lock.inode_lock); + ei = EXT4_I(pa->pa_inode); + rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); + write_unlock(pa->pa_node_lock.inode_lock); + } + + list_del(&pa->u.pa_tmp_list); + + if (pa->pa_type == MB_GROUP_PA) { + ext4_mb_release_group_pa(&e4b, pa); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } else { + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); + ext4_mb_pa_free(pa); + } + } + + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + put_bh(bitmap_bh); +out_dbg: + mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", + free, group, grp->bb_free); + return free; +} + +/* + * releases all non-used preallocated blocks for given inode + * + * It's important to discard preallocations under i_data_sem + * We don't want another block to be served from the prealloc + * space when we are discarding the inode prealloc space. + * + * FIXME!! Make sure it is valid at all the call sites + */ +void ext4_discard_preallocations(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct super_block *sb = inode->i_sb; + struct buffer_head *bitmap_bh = NULL; + struct ext4_prealloc_space *pa, *tmp; + ext4_group_t group = 0; + LIST_HEAD(list); + struct ext4_buddy e4b; + struct rb_node *iter; + int err; + + if (!S_ISREG(inode->i_mode)) + return; + + if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) + return; + + mb_debug(sb, "discard preallocation for inode %lu\n", + inode->i_ino); + trace_ext4_discard_preallocations(inode, + atomic_read(&ei->i_prealloc_active)); + +repeat: + /* first, collect all pa's in the inode */ + write_lock(&ei->i_prealloc_lock); + for (iter = rb_first(&ei->i_prealloc_node); iter; + iter = rb_next(iter)) { + pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock); + + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + /* this shouldn't happen often - nobody should + * use preallocation while we're discarding it */ + spin_unlock(&pa->pa_lock); + write_unlock(&ei->i_prealloc_lock); + ext4_msg(sb, KERN_ERR, + "uh-oh! used pa while discarding"); + WARN_ON(1); + schedule_timeout_uninterruptible(HZ); + goto repeat; + + } + if (pa->pa_deleted == 0) { + ext4_mb_mark_pa_deleted(sb, pa); + spin_unlock(&pa->pa_lock); + rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); + list_add(&pa->u.pa_tmp_list, &list); + continue; + } + + /* someone is deleting pa right now */ + spin_unlock(&pa->pa_lock); + write_unlock(&ei->i_prealloc_lock); + + /* we have to wait here because pa_deleted + * doesn't mean pa is already unlinked from + * the list. 
as we might be called from + * ->clear_inode() the inode will get freed + * and concurrent thread which is unlinking + * pa from inode's list may access already + * freed memory, bad-bad-bad */ + + /* XXX: if this happens too often, we can + * add a flag to force wait only in case + * of ->clear_inode(), but not in case of + * regular truncate */ + schedule_timeout_uninterruptible(HZ); + goto repeat; + } + write_unlock(&ei->i_prealloc_lock); + + list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { + BUG_ON(pa->pa_type != MB_INODE_PA); + group = ext4_get_group_number(sb, pa->pa_pstart); + + err = ext4_mb_load_buddy_gfp(sb, group, &e4b, + GFP_NOFS|__GFP_NOFAIL); + if (err) { + ext4_error_err(sb, -err, "Error %d loading buddy information for %u", + err, group); + continue; + } + + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); + ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", + err, group); + ext4_mb_unload_buddy(&e4b); + continue; + } + + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); + ext4_unlock_group(sb, group); + + ext4_mb_unload_buddy(&e4b); + put_bh(bitmap_bh); + + list_del(&pa->u.pa_tmp_list); + ext4_mb_pa_free(pa); + } +} + +static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa; + + BUG_ON(ext4_pspace_cachep == NULL); + pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS); + if (!pa) + return -ENOMEM; + atomic_set(&pa->pa_count, 1); + ac->ac_pa = pa; + return 0; +} + +static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa = ac->ac_pa; + + BUG_ON(!pa); + ac->ac_pa = NULL; + WARN_ON(!atomic_dec_and_test(&pa->pa_count)); + /* + * current function is only called due to an error or due to + * len of found blocks < len of requested blocks hence the PA has not + * been added to grp->bb_prealloc_list. 
So we don't need to lock it + */ + pa->pa_deleted = 1; + ext4_mb_pa_free(pa); +} + +#ifdef CONFIG_EXT4_DEBUG +static inline void ext4_mb_show_pa(struct super_block *sb) +{ + ext4_group_t i, ngroups; + + if (ext4_emergency_state(sb)) + return; + + ngroups = ext4_get_groups_count(sb); + mb_debug(sb, "groups: "); + for (i = 0; i < ngroups; i++) { + struct ext4_group_info *grp = ext4_get_group_info(sb, i); + struct ext4_prealloc_space *pa; + ext4_grpblk_t start; + struct list_head *cur; + + if (!grp) + continue; + ext4_lock_group(sb, i); + list_for_each(cur, &grp->bb_prealloc_list) { + pa = list_entry(cur, struct ext4_prealloc_space, + pa_group_list); + spin_lock(&pa->pa_lock); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, + NULL, &start); + spin_unlock(&pa->pa_lock); + mb_debug(sb, "PA:%u:%d:%d\n", i, start, + pa->pa_len); + } + ext4_unlock_group(sb, i); + mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free, + grp->bb_fragments); + } +} + +static void ext4_mb_show_ac(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + + if (ext4_emergency_state(sb)) + return; + + mb_debug(sb, "Can't allocate:" + " Allocation context details:"); + mb_debug(sb, "status %u flags 0x%x", + ac->ac_status, ac->ac_flags); + mb_debug(sb, "orig %lu/%lu/%lu@%lu, " + "goal %lu/%lu/%lu@%lu, " + "best %lu/%lu/%lu@%lu cr %d", + (unsigned long)ac->ac_o_ex.fe_group, + (unsigned long)ac->ac_o_ex.fe_start, + (unsigned long)ac->ac_o_ex.fe_len, + (unsigned long)ac->ac_o_ex.fe_logical, + (unsigned long)ac->ac_g_ex.fe_group, + (unsigned long)ac->ac_g_ex.fe_start, + (unsigned long)ac->ac_g_ex.fe_len, + (unsigned long)ac->ac_g_ex.fe_logical, + (unsigned long)ac->ac_b_ex.fe_group, + (unsigned long)ac->ac_b_ex.fe_start, + (unsigned long)ac->ac_b_ex.fe_len, + (unsigned long)ac->ac_b_ex.fe_logical, + (int)ac->ac_criteria); + mb_debug(sb, "%u found", ac->ac_found); + mb_debug(sb, "used pa: %s, ", str_yes_no(ac->ac_pa)); + if (ac->ac_pa) + mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ? + "group pa" : "inode pa"); + ext4_mb_show_pa(sb); +} +#else +static inline void ext4_mb_show_pa(struct super_block *sb) +{ +} +static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) +{ + ext4_mb_show_pa(ac->ac_sb); +} +#endif + +/* + * We use locality group preallocation for small size file. 
The size of the + * file is determined by the current size or the resulting size after + * allocation which ever is larger + * + * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req + */ +static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int bsbits = ac->ac_sb->s_blocksize_bits; + loff_t size, isize; + bool inode_pa_eligible, group_pa_eligible; + + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return; + + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + + group_pa_eligible = sbi->s_mb_group_prealloc > 0; + inode_pa_eligible = true; + size = extent_logical_end(sbi, &ac->ac_o_ex); + isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) + >> bsbits; + + /* No point in using inode preallocation for closed files */ + if ((size == isize) && !ext4_fs_is_busy(sbi) && + !inode_is_open_for_write(ac->ac_inode)) + inode_pa_eligible = false; + + size = max(size, isize); + /* Don't use group allocation for large files */ + if (size > sbi->s_mb_stream_request) + group_pa_eligible = false; + + if (!group_pa_eligible) { + if (inode_pa_eligible) + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + else + ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; + return; + } + + BUG_ON(ac->ac_lg != NULL); + /* + * locality group prealloc space are per cpu. The reason for having + * per cpu locality group is to reduce the contention between block + * request from multiple CPUs. + */ + ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups); + + /* we're going to use group allocation */ + ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; + + /* serialize all allocations in the group */ + mutex_lock(&ac->ac_lg->lg_mutex); +} + +static noinline_for_stack void +ext4_mb_initialize_context(struct ext4_allocation_context *ac, + struct ext4_allocation_request *ar) +{ + struct super_block *sb = ar->inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_group_t group; + unsigned int len; + ext4_fsblk_t goal; + ext4_grpblk_t block; + + /* we can't allocate > group size */ + len = ar->len; + + /* just a dirty hack to filter too big requests */ + if (len >= EXT4_CLUSTERS_PER_GROUP(sb)) + len = EXT4_CLUSTERS_PER_GROUP(sb); + + /* start searching from the goal */ + goal = ar->goal; + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= ext4_blocks_count(es)) + goal = le32_to_cpu(es->s_first_data_block); + ext4_get_group_no_and_offset(sb, goal, &group, &block); + + /* set up allocation goals */ + ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); + ac->ac_status = AC_STATUS_CONTINUE; + ac->ac_sb = sb; + ac->ac_inode = ar->inode; + ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; + ac->ac_o_ex.fe_group = group; + ac->ac_o_ex.fe_start = block; + ac->ac_o_ex.fe_len = len; + ac->ac_g_ex = ac->ac_o_ex; + ac->ac_orig_goal_len = ac->ac_g_ex.fe_len; + ac->ac_flags = ar->flags; + + /* we have to define context: we'll work with a file or + * locality group. this is a policy, actually */ + ext4_mb_group_or_file(ac); + + mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, " + "left: %u/%u, right %u/%u to %swritable\n", + (unsigned) ar->len, (unsigned) ar->logical, + (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, + (unsigned) ar->lleft, (unsigned) ar->pleft, + (unsigned) ar->lright, (unsigned) ar->pright, + inode_is_open_for_write(ar->inode) ? 
"" : "non-"); +} + +static noinline_for_stack void +ext4_mb_discard_lg_preallocations(struct super_block *sb, + struct ext4_locality_group *lg, + int order, int total_entries) +{ + ext4_group_t group = 0; + struct ext4_buddy e4b; + LIST_HEAD(discard_list); + struct ext4_prealloc_space *pa, *tmp; + + mb_debug(sb, "discard locality group preallocation\n"); + + spin_lock(&lg->lg_prealloc_lock); + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], + pa_node.lg_list, + lockdep_is_held(&lg->lg_prealloc_lock)) { + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + /* + * This is the pa that we just used + * for block allocation. So don't + * free that + */ + spin_unlock(&pa->pa_lock); + continue; + } + if (pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + /* only lg prealloc space */ + BUG_ON(pa->pa_type != MB_GROUP_PA); + + /* seems this one can be freed ... */ + ext4_mb_mark_pa_deleted(sb, pa); + spin_unlock(&pa->pa_lock); + + list_del_rcu(&pa->pa_node.lg_list); + list_add(&pa->u.pa_tmp_list, &discard_list); + + total_entries--; + if (total_entries <= 5) { + /* + * we want to keep only 5 entries + * allowing it to grow to 8. This + * mak sure we don't call discard + * soon for this list. + */ + break; + } + } + spin_unlock(&lg->lg_prealloc_lock); + + list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { + int err; + + group = ext4_get_group_number(sb, pa->pa_pstart); + err = ext4_mb_load_buddy_gfp(sb, group, &e4b, + GFP_NOFS|__GFP_NOFAIL); + if (err) { + ext4_error_err(sb, -err, "Error %d loading buddy information for %u", + err, group); + continue; + } + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); + ext4_mb_release_group_pa(&e4b, pa); + ext4_unlock_group(sb, group); + + ext4_mb_unload_buddy(&e4b); + list_del(&pa->u.pa_tmp_list); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } +} + +/* + * We have incremented pa_count. So it cannot be freed at this + * point. Also we hold lg_mutex. So no parallel allocation is + * possible from this lg. That means pa_free cannot be updated. + * + * A parallel ext4_mb_discard_group_preallocations is possible. + * which can cause the lg_prealloc_list to be updated. 
+ */ + +static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) +{ + int order, added = 0, lg_prealloc_count = 1; + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg = ac->ac_lg; + struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; + + order = fls(pa->pa_free) - 1; + if (order > PREALLOC_TB_SIZE - 1) + /* The max size of hash table is PREALLOC_TB_SIZE */ + order = PREALLOC_TB_SIZE - 1; + /* Add the prealloc space to lg */ + spin_lock(&lg->lg_prealloc_lock); + list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], + pa_node.lg_list, + lockdep_is_held(&lg->lg_prealloc_lock)) { + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted) { + spin_unlock(&tmp_pa->pa_lock); + continue; + } + if (!added && pa->pa_free < tmp_pa->pa_free) { + /* Add to the tail of the previous entry */ + list_add_tail_rcu(&pa->pa_node.lg_list, + &tmp_pa->pa_node.lg_list); + added = 1; + /* + * we want to count the total + * number of entries in the list + */ + } + spin_unlock(&tmp_pa->pa_lock); + lg_prealloc_count++; + } + if (!added) + list_add_tail_rcu(&pa->pa_node.lg_list, + &lg->lg_prealloc_list[order]); + spin_unlock(&lg->lg_prealloc_lock); + + /* Now trim the list to be not more than 8 elements */ + if (lg_prealloc_count > 8) + ext4_mb_discard_lg_preallocations(sb, lg, + order, lg_prealloc_count); +} + +/* + * release all resource we used in allocation + */ +static void ext4_mb_release_context(struct ext4_allocation_context *ac) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_prealloc_space *pa = ac->ac_pa; + if (pa) { + if (pa->pa_type == MB_GROUP_PA) { + /* see comment in ext4_mb_use_group_pa() */ + spin_lock(&pa->pa_lock); + pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + pa->pa_free -= ac->ac_b_ex.fe_len; + pa->pa_len -= ac->ac_b_ex.fe_len; + spin_unlock(&pa->pa_lock); + + /* + * We want to add the pa to the right bucket. + * Remove it from the list and while adding + * make sure the list to which we are adding + * doesn't grow big. 
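+			 * (ext4_mb_add_n_trim() re-inserts it into the
+			 * bucket matching its new pa_free order and trims
+			 * that list back to at most 8 entries.)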
+ */ + if (likely(pa->pa_free)) { + spin_lock(pa->pa_node_lock.lg_lock); + list_del_rcu(&pa->pa_node.lg_list); + spin_unlock(pa->pa_node_lock.lg_lock); + ext4_mb_add_n_trim(ac); + } + } + + ext4_mb_put_pa(ac, ac->ac_sb, pa); + } + if (ac->ac_bitmap_folio) + folio_put(ac->ac_bitmap_folio); + if (ac->ac_buddy_folio) + folio_put(ac->ac_buddy_folio); + if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) + mutex_unlock(&ac->ac_lg->lg_mutex); + ext4_mb_collect_stats(ac); +} + +static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) +{ + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + int ret; + int freed = 0, busy = 0; + int retry = 0; + + trace_ext4_mb_discard_preallocations(sb, needed); + + if (needed == 0) + needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; + repeat: + for (i = 0; i < ngroups && needed > 0; i++) { + ret = ext4_mb_discard_group_preallocations(sb, i, &busy); + freed += ret; + needed -= ret; + cond_resched(); + } + + if (needed > 0 && busy && ++retry < 3) { + busy = 0; + goto repeat; + } + + return freed; +} + +static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, + struct ext4_allocation_context *ac, u64 *seq) +{ + int freed; + u64 seq_retry = 0; + bool ret = false; + + freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); + if (freed) { + ret = true; + goto out_dbg; + } + seq_retry = ext4_get_discard_pa_seq_sum(); + if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) { + ac->ac_flags |= EXT4_MB_STRICT_CHECK; + *seq = seq_retry; + ret = true; + } + +out_dbg: + mb_debug(sb, "freed %d, retry ? %s\n", freed, str_yes_no(ret)); + return ret; +} + +/* + * Simple allocator for Ext4 fast commit replay path. It searches for blocks + * linearly starting at the goal block and also excludes the blocks which + * are going to be in use after fast commit replay. 
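+ * (The scan starts at the goal block's bit in its group's bitmap,
+ * moves on when a group is exhausted, and wraps around until every
+ * group has been tried once.)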
+ */ +static ext4_fsblk_t +ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) +{ + struct buffer_head *bitmap_bh; + struct super_block *sb = ar->inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t group, nr; + ext4_grpblk_t blkoff; + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_grpblk_t i = 0; + ext4_fsblk_t goal, block; + struct ext4_super_block *es = sbi->s_es; + + goal = ar->goal; + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= ext4_blocks_count(es)) + goal = le32_to_cpu(es->s_first_data_block); + + ar->len = 0; + ext4_get_group_no_and_offset(sb, goal, &group, &blkoff); + for (nr = ext4_get_groups_count(sb); nr > 0; nr--) { + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) { + *errp = PTR_ERR(bitmap_bh); + pr_warn("Failed to read block bitmap\n"); + return 0; + } + + while (1) { + i = mb_find_next_zero_bit(bitmap_bh->b_data, max, + blkoff); + if (i >= max) + break; + if (ext4_fc_replay_check_excluded(sb, + ext4_group_first_block_no(sb, group) + + EXT4_C2B(sbi, i))) { + blkoff = i + 1; + } else + break; + } + brelse(bitmap_bh); + if (i < max) + break; + + if (++group >= ext4_get_groups_count(sb)) + group = 0; + + blkoff = 0; + } + + if (i >= max) { + *errp = -ENOSPC; + return 0; + } + + block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i); + ext4_mb_mark_bb(sb, block, 1, true); + ar->len = 1; + + *errp = 0; + return block; +} + +/* + * Main entry point into mballoc to allocate blocks + * it tries to use preallocation first, then falls back + * to usual allocation + */ +ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, + struct ext4_allocation_request *ar, int *errp) +{ + struct ext4_allocation_context *ac = NULL; + struct ext4_sb_info *sbi; + struct super_block *sb; + ext4_fsblk_t block = 0; + unsigned int inquota = 0; + unsigned int reserv_clstrs = 0; + int retries = 0; + u64 seq; + + might_sleep(); + sb = ar->inode->i_sb; + sbi = EXT4_SB(sb); + + trace_ext4_request_blocks(ar); + if (sbi->s_mount_state & EXT4_FC_REPLAY) + return ext4_mb_new_blocks_simple(ar, errp); + + /* Allow to use superuser reservation for quota file */ + if (ext4_is_quota_file(ar->inode)) + ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; + + if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { + /* Without delayed allocation we need to verify + * there is enough free blocks to do block allocation + * and verify allocation doesn't exceed the quota limits. 
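+	 * (If the free-cluster reservation fails, the request is halved
+	 * and retried, e.g. 256, 128, 64, ... blocks, until it succeeds
+	 * or drops to zero with -ENOSPC; quota failures instead shrink
+	 * it one block at a time toward -EDQUOT.)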
+ */ + while (ar->len && + ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { + + /* let others to free the space */ + cond_resched(); + ar->len = ar->len >> 1; + } + if (!ar->len) { + ext4_mb_show_pa(sb); + *errp = -ENOSPC; + return 0; + } + reserv_clstrs = ar->len; + if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { + dquot_alloc_block_nofail(ar->inode, + EXT4_C2B(sbi, ar->len)); + } else { + while (ar->len && + dquot_alloc_block(ar->inode, + EXT4_C2B(sbi, ar->len))) { + + ar->flags |= EXT4_MB_HINT_NOPREALLOC; + ar->len--; + } + } + inquota = ar->len; + if (ar->len == 0) { + *errp = -EDQUOT; + goto out; + } + } + + ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS); + if (!ac) { + ar->len = 0; + *errp = -ENOMEM; + goto out; + } + + ext4_mb_initialize_context(ac, ar); + + ac->ac_op = EXT4_MB_HISTORY_PREALLOC; + seq = this_cpu_read(discard_pa_seq); + if (!ext4_mb_use_preallocated(ac)) { + ac->ac_op = EXT4_MB_HISTORY_ALLOC; + ext4_mb_normalize_request(ac, ar); + + *errp = ext4_mb_pa_alloc(ac); + if (*errp) + goto errout; +repeat: + /* allocate space in core */ + *errp = ext4_mb_regular_allocator(ac); + /* + * pa allocated above is added to grp->bb_prealloc_list only + * when we were able to allocate some block i.e. when + * ac->ac_status == AC_STATUS_FOUND. + * And error from above mean ac->ac_status != AC_STATUS_FOUND + * So we have to free this pa here itself. + */ + if (*errp) { + ext4_mb_pa_put_free(ac); + ext4_discard_allocated_blocks(ac); + goto errout; + } + if (ac->ac_status == AC_STATUS_FOUND && + ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len) + ext4_mb_pa_put_free(ac); + } + if (likely(ac->ac_status == AC_STATUS_FOUND)) { + *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); + if (*errp) { + ext4_discard_allocated_blocks(ac); + goto errout; + } else { + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + ar->len = ac->ac_b_ex.fe_len; + } + } else { + if (++retries < 3 && + ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) + goto repeat; + /* + * If block allocation fails then the pa allocated above + * needs to be freed here itself. + */ + ext4_mb_pa_put_free(ac); + *errp = -ENOSPC; + } + + if (*errp) { +errout: + ac->ac_b_ex.fe_len = 0; + ar->len = 0; + ext4_mb_show_ac(ac); + } + ext4_mb_release_context(ac); + kmem_cache_free(ext4_ac_cachep, ac); +out: + if (inquota && ar->len < inquota) + dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); + if (!ar->len) { + if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + reserv_clstrs); + } + + trace_ext4_allocate_blocks(ar, (unsigned long long)block); + + return block; +} + +/* + * We can merge two free data extents only if the physical blocks + * are contiguous, AND the extents were freed by the same transaction, + * AND the blocks are associated with the same group. 
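+ * (For example, clusters 100..149 and 150..199 freed in the same
+ * transaction and group merge into a single 100..199 entry; otherwise
+ * they stay separate nodes in the rb tree.)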
+ */
+static inline bool
+ext4_freed_extents_can_be_merged(struct ext4_free_data *entry1,
+				 struct ext4_free_data *entry2)
+{
+	if (entry1->efd_tid != entry2->efd_tid)
+		return false;
+	if (entry1->efd_start_cluster + entry1->efd_count !=
+	    entry2->efd_start_cluster)
+		return false;
+	if (WARN_ON_ONCE(entry1->efd_group != entry2->efd_group))
+		return false;
+	return true;
+}
+
+static inline void
+ext4_merge_freed_extents(struct ext4_sb_info *sbi, struct rb_root *root,
+			 struct ext4_free_data *entry1,
+			 struct ext4_free_data *entry2)
+{
+	entry1->efd_count += entry2->efd_count;
+	spin_lock(&sbi->s_md_lock);
+	list_del(&entry2->efd_list);
+	spin_unlock(&sbi->s_md_lock);
+	rb_erase(&entry2->efd_node, root);
+	kmem_cache_free(ext4_free_data_cachep, entry2);
+}
+
+static inline void
+ext4_try_merge_freed_extent_prev(struct ext4_sb_info *sbi, struct rb_root *root,
+				 struct ext4_free_data *entry)
+{
+	struct ext4_free_data *prev;
+	struct rb_node *node;
+
+	node = rb_prev(&entry->efd_node);
+	if (!node)
+		return;
+
+	prev = rb_entry(node, struct ext4_free_data, efd_node);
+	if (ext4_freed_extents_can_be_merged(prev, entry))
+		ext4_merge_freed_extents(sbi, root, prev, entry);
+}
+
+static inline void
+ext4_try_merge_freed_extent_next(struct ext4_sb_info *sbi, struct rb_root *root,
+				 struct ext4_free_data *entry)
+{
+	struct ext4_free_data *next;
+	struct rb_node *node;
+
+	node = rb_next(&entry->efd_node);
+	if (!node)
+		return;
+
+	next = rb_entry(node, struct ext4_free_data, efd_node);
+	if (ext4_freed_extents_can_be_merged(entry, next))
+		ext4_merge_freed_extents(sbi, root, entry, next);
+}
+
+static noinline_for_stack void
+ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
+		      struct ext4_free_data *new_entry)
+{
+	ext4_group_t group = e4b->bd_group;
+	ext4_grpblk_t cluster;
+	ext4_grpblk_t clusters = new_entry->efd_count;
+	struct ext4_free_data *entry = NULL;
+	struct ext4_group_info *db = e4b->bd_info;
+	struct super_block *sb = e4b->bd_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct rb_root *root = &db->bb_free_root;
+	struct rb_node **n = &root->rb_node;
+	struct rb_node *parent = NULL, *new_node;
+
+	BUG_ON(!ext4_handle_valid(handle));
+	BUG_ON(e4b->bd_bitmap_folio == NULL);
+	BUG_ON(e4b->bd_buddy_folio == NULL);
+
+	new_node = &new_entry->efd_node;
+	cluster = new_entry->efd_start_cluster;
+
+	if (!*n) {
+		/* first free block extent.
We need to + protect buddy cache from being freed, + * otherwise we'll refresh it from + * on-disk bitmap and lose not-yet-available + * blocks */ + folio_get(e4b->bd_buddy_folio); + folio_get(e4b->bd_bitmap_folio); + } + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_free_data, efd_node); + if (cluster < entry->efd_start_cluster) + n = &(*n)->rb_left; + else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) + n = &(*n)->rb_right; + else { + ext4_grp_locked_error(sb, group, 0, + ext4_group_first_block_no(sb, group) + + EXT4_C2B(sbi, cluster), + "Block already on to-be-freed list"); + kmem_cache_free(ext4_free_data_cachep, new_entry); + return; + } + } + + atomic_add(clusters, &sbi->s_mb_free_pending); + if (!entry) + goto insert; + + /* Now try to see the extent can be merged to prev and next */ + if (ext4_freed_extents_can_be_merged(new_entry, entry)) { + entry->efd_start_cluster = cluster; + entry->efd_count += new_entry->efd_count; + kmem_cache_free(ext4_free_data_cachep, new_entry); + ext4_try_merge_freed_extent_prev(sbi, root, entry); + return; + } + if (ext4_freed_extents_can_be_merged(entry, new_entry)) { + entry->efd_count += new_entry->efd_count; + kmem_cache_free(ext4_free_data_cachep, new_entry); + ext4_try_merge_freed_extent_next(sbi, root, entry); + return; + } +insert: + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, root); + + spin_lock(&sbi->s_md_lock); + list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]); + spin_unlock(&sbi->s_md_lock); +} + +static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, + unsigned long count) +{ + struct super_block *sb = inode->i_sb; + ext4_group_t group; + ext4_grpblk_t blkoff; + + ext4_get_group_no_and_offset(sb, block, &group, &blkoff); + ext4_mb_mark_context(NULL, sb, false, group, blkoff, count, + EXT4_MB_BITMAP_MARKED_CHECK | + EXT4_MB_SYNC_UPDATE, + NULL); +} + +/** + * ext4_mb_clear_bb() -- helper function for freeing blocks. + * Used by ext4_free_blocks() + * @handle: handle for this transaction + * @inode: inode + * @block: starting physical block to be freed + * @count: number of blocks to be freed + * @flags: flags used by ext4_free_blocks + */ +static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, + ext4_fsblk_t block, unsigned long count, + int flags) +{ + struct super_block *sb = inode->i_sb; + struct ext4_group_info *grp; + unsigned int overflow; + ext4_grpblk_t bit; + ext4_group_t block_group; + struct ext4_sb_info *sbi; + struct ext4_buddy e4b; + unsigned int count_clusters; + int err = 0; + int mark_flags = 0; + ext4_grpblk_t changed; + + sbi = EXT4_SB(sb); + + if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && + !ext4_inode_block_valid(inode, block, count)) { + ext4_error(sb, "Freeing blocks in system zone - " + "Block = %llu, count = %lu", block, count); + /* err = 0. ext4_std_error should be a no op */ + goto error_out; + } + flags |= EXT4_FREE_BLOCKS_VALIDATED; + +do_more: + overflow = 0; + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + + grp = ext4_get_group_info(sb, block_group); + if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) + return; + + /* + * Check to see if we are freeing blocks across a group + * boundary. 
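+	 * (For example, with illustrative numbers: with 8192 blocks per
+	 * group and one-block clusters, freeing 100 blocks that start 50
+	 * blocks before the end of a group frees those 50 here and loops
+	 * back to "do_more" for the other 50 in the next group.)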
+ */ + if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { + overflow = EXT4_C2B(sbi, bit) + count - + EXT4_BLOCKS_PER_GROUP(sb); + count -= overflow; + /* The range changed so it's no longer validated */ + flags &= ~EXT4_FREE_BLOCKS_VALIDATED; + } + count_clusters = EXT4_NUM_B2C(sbi, count); + trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); + + /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ + err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, + GFP_NOFS|__GFP_NOFAIL); + if (err) + goto error_out; + + if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && + !ext4_inode_block_valid(inode, block, count)) { + ext4_error(sb, "Freeing blocks in system zone - " + "Block = %llu, count = %lu", block, count); + /* err = 0. ext4_std_error should be a no op */ + goto error_clean; + } + +#ifdef AGGRESSIVE_CHECK + mark_flags |= EXT4_MB_BITMAP_MARKED_CHECK; +#endif + err = ext4_mb_mark_context(handle, sb, false, block_group, bit, + count_clusters, mark_flags, &changed); + + + if (err && changed == 0) + goto error_clean; + +#ifdef AGGRESSIVE_CHECK + BUG_ON(changed != count_clusters); +#endif + + /* + * We need to make sure we don't reuse the freed block until after the + * transaction is committed. We make an exception if the inode is to be + * written in writeback mode since writeback mode has weak data + * consistency guarantees. + */ + if (ext4_handle_valid(handle) && + ((flags & EXT4_FREE_BLOCKS_METADATA) || + !ext4_should_writeback_data(inode))) { + struct ext4_free_data *new_entry; + /* + * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed + * to fail. + */ + new_entry = kmem_cache_alloc(ext4_free_data_cachep, + GFP_NOFS|__GFP_NOFAIL); + new_entry->efd_start_cluster = bit; + new_entry->efd_group = block_group; + new_entry->efd_count = count_clusters; + new_entry->efd_tid = handle->h_transaction->t_tid; + + ext4_lock_group(sb, block_group); + ext4_mb_free_metadata(handle, &e4b, new_entry); + } else { + if (test_opt(sb, DISCARD)) { + err = ext4_issue_discard(sb, block_group, bit, + count_clusters); + /* + * Ignore EOPNOTSUPP error. This is consistent with + * what happens when using journal. 
+ */ + if (err == -EOPNOTSUPP) + err = 0; + if (err) + ext4_msg(sb, KERN_WARNING, "discard request in" + " group:%u block:%d count:%lu failed" + " with %d", block_group, bit, count, + err); + } + + EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); + + ext4_lock_group(sb, block_group); + mb_free_blocks(inode, &e4b, bit, count_clusters); + } + + ext4_unlock_group(sb, block_group); + + /* + * on a bigalloc file system, defer the s_freeclusters_counter + * update to the caller (ext4_remove_space and friends) so they + * can determine if a cluster freed here should be rereserved + */ + if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) { + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); + percpu_counter_add(&sbi->s_freeclusters_counter, + count_clusters); + } + + if (overflow && !err) { + block += count; + count = overflow; + ext4_mb_unload_buddy(&e4b); + /* The range changed so it's no longer validated */ + flags &= ~EXT4_FREE_BLOCKS_VALIDATED; + goto do_more; + } + +error_clean: + ext4_mb_unload_buddy(&e4b); +error_out: + ext4_std_error(sb, err); +} + +/** + * ext4_free_blocks() -- Free given blocks and update quota + * @handle: handle for this transaction + * @inode: inode + * @bh: optional buffer of the block to be freed + * @block: starting physical block to be freed + * @count: number of blocks to be freed + * @flags: flags used by ext4_free_blocks + */ +void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags) +{ + struct super_block *sb = inode->i_sb; + unsigned int overflow; + struct ext4_sb_info *sbi; + + sbi = EXT4_SB(sb); + + if (bh) { + if (block) + BUG_ON(block != bh->b_blocknr); + else + block = bh->b_blocknr; + } + + if (sbi->s_mount_state & EXT4_FC_REPLAY) { + ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count)); + return; + } + + might_sleep(); + + if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && + !ext4_inode_block_valid(inode, block, count)) { + ext4_error(sb, "Freeing blocks not in datazone - " + "block = %llu, count = %lu", block, count); + return; + } + flags |= EXT4_FREE_BLOCKS_VALIDATED; + + ext4_debug("freeing block %llu\n", block); + trace_ext4_free_blocks(inode, block, count, flags); + + if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { + BUG_ON(count > 1); + + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, + inode, bh, block); + } + + /* + * If the extent to be freed does not begin on a cluster + * boundary, we need to deal with partial clusters at the + * beginning and end of the extent. Normally we will free + * blocks at the beginning or the end unless we are explicitly + * requested to avoid doing so. 
+ */
+ overflow = EXT4_PBLK_COFF(sbi, block);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
+ overflow = sbi->s_cluster_ratio - overflow;
+ block += overflow;
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else {
+ block -= overflow;
+ count += overflow;
+ }
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
+ }
+ overflow = EXT4_LBLK_COFF(sbi, count);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else
+ count += sbi->s_cluster_ratio - overflow;
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
+ }
+
+ if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+ int i;
+ int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
+
+ for (i = 0; i < count; i++) {
+ cond_resched();
+ if (is_metadata)
+ bh = sb_find_get_block_nonatomic(inode->i_sb,
+ block + i);
+ ext4_forget(handle, is_metadata, inode, bh, block + i);
+ }
+ }
+
+ ext4_mb_clear_bb(handle, inode, block, count, flags);
+}
+
+/**
+ * ext4_group_add_blocks() -- Add given blocks to an existing group
+ * @handle: handle to this transaction
+ * @sb: super block
+ * @block: start physical block to add to the block group
+ * @count: number of blocks to add
+ *
+ * This marks the blocks as free in the bitmap and buddy.
+ */
+int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t block, unsigned long count)
+{
+ ext4_group_t block_group;
+ ext4_grpblk_t bit;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_buddy e4b;
+ int err = 0;
+ ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block);
+ ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1);
+ unsigned long cluster_count = last_cluster - first_cluster + 1;
+ ext4_grpblk_t changed;
+
+ ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
+
+ if (cluster_count == 0)
+ return 0;
+
+ ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+ /*
+ * Check to see if we are adding blocks across a group
+ * boundary.
+ */
+ if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) {
+ ext4_warning(sb, "too many blocks added to group %u",
+ block_group);
+ err = -EINVAL;
+ goto error_out;
+ }
+
+ err = ext4_mb_load_buddy(sb, block_group, &e4b);
+ if (err)
+ goto error_out;
+
+ if (!ext4_sb_block_valid(sb, NULL, block, count)) {
+ ext4_error(sb, "Adding blocks in system zones - "
+ "Block = %llu, count = %lu",
+ block, count);
+ err = -EINVAL;
+ goto error_clean;
+ }
+
+ err = ext4_mb_mark_context(handle, sb, false, block_group, bit,
+ cluster_count, EXT4_MB_BITMAP_MARKED_CHECK,
+ &changed);
+ if (err && changed == 0)
+ goto error_clean;
+
+ if (changed != cluster_count)
+ ext4_error(sb, "bit already cleared in group %u", block_group);
+
+ ext4_lock_group(sb, block_group);
+ mb_free_blocks(NULL, &e4b, bit, cluster_count);
+ ext4_unlock_group(sb, block_group);
+ percpu_counter_add(&sbi->s_freeclusters_counter,
+ changed);
+
+error_clean:
+ ext4_mb_unload_buddy(&e4b);
+error_out:
+ ext4_std_error(sb, err);
+ return err;
+}
+
+/**
+ * ext4_trim_extent -- function to TRIM one single free extent in the group
+ * @sb: super block for the file system
+ * @start: starting block of the free extent in the alloc. group
+ * @count: number of blocks to TRIM
+ * @e4b: ext4 buddy for the group
+ *
+ * Trim "count" blocks starting at "start" in the "group". To ensure that no
+ * one will allocate those blocks, mark them as used in the buddy bitmap.
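+ * Once the discard completes, the blocks are freed back to the buddy.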
+ * This must be called with the group lock held.
+ */
+static int ext4_trim_extent(struct super_block *sb,
+ int start, int count, struct ext4_buddy *e4b)
+__releases(bitlock)
+__acquires(bitlock)
+{
+ struct ext4_free_extent ex;
+ ext4_group_t group = e4b->bd_group;
+ int ret = 0;
+
+ trace_ext4_trim_extent(sb, group, start, count);
+
+ assert_spin_locked(ext4_group_lock_ptr(sb, group));
+
+ ex.fe_start = start;
+ ex.fe_group = group;
+ ex.fe_len = count;
+
+ /*
+ * Mark blocks used, so no one can reuse them while
+ * being trimmed.
+ */
+ mb_mark_used(e4b, &ex);
+ ext4_unlock_group(sb, group);
+ ret = ext4_issue_discard(sb, group, start, count);
+ ext4_lock_group(sb, group);
+ mb_free_blocks(NULL, e4b, start, ex.fe_len);
+ return ret;
+}
+
+static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb,
+ ext4_group_t grp)
+{
+ unsigned long nr_clusters_in_group;
+
+ if (grp < (ext4_get_groups_count(sb) - 1))
+ nr_clusters_in_group = EXT4_CLUSTERS_PER_GROUP(sb);
+ else
+ nr_clusters_in_group = (ext4_blocks_count(EXT4_SB(sb)->s_es) -
+ ext4_group_first_block_no(sb, grp))
+ >> EXT4_CLUSTER_BITS(sb);
+
+ return nr_clusters_in_group - 1;
+}
+
+static bool ext4_trim_interrupted(void)
+{
+ return fatal_signal_pending(current) || freezing(current);
+}
+
+static int ext4_try_to_trim_range(struct super_block *sb,
+ struct ext4_buddy *e4b, ext4_grpblk_t start,
+ ext4_grpblk_t max, ext4_grpblk_t minblocks)
+__acquires(ext4_group_lock_ptr(sb, e4b->bd_group))
+__releases(ext4_group_lock_ptr(sb, e4b->bd_group))
+{
+ ext4_grpblk_t next, count, free_count, last, origin_start;
+ bool set_trimmed = false;
+ void *bitmap;
+
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+ return 0;
+
+ last = ext4_last_grp_cluster(sb, e4b->bd_group);
+ bitmap = e4b->bd_bitmap;
+ if (start == 0 && max >= last)
+ set_trimmed = true;
+ origin_start = start;
+ start = max(e4b->bd_info->bb_first_free, start);
+ count = 0;
+ free_count = 0;
+
+ while (start <= max) {
+ start = mb_find_next_zero_bit(bitmap, max + 1, start);
+ if (start > max)
+ break;
+
+ next = mb_find_next_bit(bitmap, last + 1, start);
+ if (origin_start == 0 && next >= last)
+ set_trimmed = true;
+
+ if ((next - start) >= minblocks) {
+ int ret = ext4_trim_extent(sb, start, next - start, e4b);
+
+ if (ret && ret != -EOPNOTSUPP)
+ return count;
+ count += next - start;
+ }
+ free_count += next - start;
+ start = next + 1;
+
+ if (ext4_trim_interrupted())
+ return count;
+
+ if (need_resched()) {
+ ext4_unlock_group(sb, e4b->bd_group);
+ cond_resched();
+ ext4_lock_group(sb, e4b->bd_group);
+ }
+
+ if ((e4b->bd_info->bb_free - free_count) < minblocks)
+ break;
+ }
+
+ if (set_trimmed)
+ EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info);
+
+ return count;
+}
+
+/**
+ * ext4_trim_all_free -- function to trim all free space in alloc. group
+ * @sb: super block for file system
+ * @group: group to be trimmed
+ * @start: first group block to examine
+ * @max: last group block to examine
+ * @minblocks: minimum extent block count
+ *
+ * ext4_trim_all_free walks through group's block bitmap searching for free
+ * extents. When a free extent is found, it is marked as used in the group
+ * buddy bitmap. Then a TRIM command is issued on this extent and the extent
+ * is freed again in the group buddy bitmap.
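+ * A group that was already trimmed is skipped unless a smaller minimum
+ * extent length than last time is requested.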
+ */ +static ext4_grpblk_t +ext4_trim_all_free(struct super_block *sb, ext4_group_t group, + ext4_grpblk_t start, ext4_grpblk_t max, + ext4_grpblk_t minblocks) +{ + struct ext4_buddy e4b; + int ret; + + trace_ext4_trim_all_free(sb, group, start, max); + + ret = ext4_mb_load_buddy(sb, group, &e4b); + if (ret) { + ext4_warning(sb, "Error %d loading buddy information for %u", + ret, group); + return ret; + } + + ext4_lock_group(sb, group); + + if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || + minblocks < EXT4_SB(sb)->s_last_trim_minblks) + ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); + else + ret = 0; + + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + + ext4_debug("trimmed %d blocks in the group %d\n", + ret, group); + + return ret; +} + +/** + * ext4_trim_fs() -- trim ioctl handle function + * @sb: superblock for filesystem + * @range: fstrim_range structure + * + * start: First Byte to trim + * len: number of Bytes to trim from start + * minlen: minimum extent length in Bytes + * ext4_trim_fs goes through all allocation groups containing Bytes from + * start to start+len. For each such a group ext4_trim_all_free function + * is invoked to trim all free space. + */ +int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev); + struct ext4_group_info *grp; + ext4_group_t group, first_group, last_group; + ext4_grpblk_t cnt = 0, first_cluster, last_cluster; + uint64_t start, end, minlen, trimmed = 0; + ext4_fsblk_t first_data_blk = + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); + int ret = 0; + + start = range->start >> sb->s_blocksize_bits; + end = start + (range->len >> sb->s_blocksize_bits) - 1; + minlen = EXT4_NUM_B2C(EXT4_SB(sb), + range->minlen >> sb->s_blocksize_bits); + + if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) || + start >= max_blks || + range->len < sb->s_blocksize) + return -EINVAL; + /* No point to try to trim less than discard granularity */ + if (range->minlen < discard_granularity) { + minlen = EXT4_NUM_B2C(EXT4_SB(sb), + discard_granularity >> sb->s_blocksize_bits); + if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) + goto out; + } + if (end >= max_blks - 1) + end = max_blks - 1; + if (end <= first_data_blk) + goto out; + if (start < first_data_blk) + start = first_data_blk; + + /* Determine first and last group to examine based on start and end */ + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, + &first_group, &first_cluster); + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, + &last_group, &last_cluster); + + /* end now represents the last cluster to discard in this group */ + end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; + + for (group = first_group; group <= last_group; group++) { + if (ext4_trim_interrupted()) + break; + grp = ext4_get_group_info(sb, group); + if (!grp) + continue; + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + ret = ext4_mb_init_group(sb, group, GFP_NOFS); + if (ret) + break; + } + + /* + * For all the groups except the last one, last cluster will + * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to + * change it for the last group, note that last_cluster is + * already computed earlier by ext4_get_group_no_and_offset() + */ + if (group == last_group) + end = last_cluster; + if (grp->bb_free >= minlen) { + cnt = ext4_trim_all_free(sb, group, first_cluster, + end, minlen); + if (cnt < 0) { + ret = 
cnt; + break; + } + trimmed += cnt; + } + + /* + * For every group except the first one, we are sure + * that the first cluster to discard will be cluster #0. + */ + first_cluster = 0; + } + + if (!ret) + EXT4_SB(sb)->s_last_trim_minblks = minlen; + +out: + range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; + return ret; +} + +/* Iterate all the free extents in the group. */ +int +ext4_mballoc_query_range( + struct super_block *sb, + ext4_group_t group, + ext4_grpblk_t first, + ext4_grpblk_t end, + ext4_mballoc_query_range_fn meta_formatter, + ext4_mballoc_query_range_fn formatter, + void *priv) +{ + void *bitmap; + ext4_grpblk_t start, next; + struct ext4_buddy e4b; + int error; + + error = ext4_mb_load_buddy(sb, group, &e4b); + if (error) + return error; + bitmap = e4b.bd_bitmap; + + ext4_lock_group(sb, group); + + start = max(e4b.bd_info->bb_first_free, first); + if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) + end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; + if (meta_formatter && start != first) { + if (start > end) + start = end; + ext4_unlock_group(sb, group); + error = meta_formatter(sb, group, first, start - first, + priv); + if (error) + goto out_unload; + ext4_lock_group(sb, group); + } + while (start <= end) { + start = mb_find_next_zero_bit(bitmap, end + 1, start); + if (start > end) + break; + next = mb_find_next_bit(bitmap, end + 1, start); + + ext4_unlock_group(sb, group); + error = formatter(sb, group, start, next - start, priv); + if (error) + goto out_unload; + ext4_lock_group(sb, group); + + start = next + 1; + } + + ext4_unlock_group(sb, group); +out_unload: + ext4_mb_unload_buddy(&e4b); + + return error; +} + +#ifdef CONFIG_EXT4_KUNIT_TESTS +#include "mballoc-test.c" +#endif -- 2.43.0
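The FITRIM handling above is largely coordinate conversion: ext4_trim_fs() turns the byte-based fstrim_range into filesystem blocks, maps those onto (group, cluster) pairs, and then trims each group's free extents. The standalone sketch below mirrors that arithmetic under assumed geometry (4 KiB blocks, 32768 blocks per group, first data block 0, cluster ratio 1); it is illustrative only and not part of the patch, and the real code reads all of these values from the superblock.

#include <stdio.h>

#define BLOCKSIZE_BITS   12		/* assumed: 4 KiB blocks */
#define BLOCKS_PER_GROUP 32768ULL	/* assumed group geometry */

int main(void)
{
	unsigned long long range_start = 5ULL << 30;	/* trim from byte 5 GiB */
	unsigned long long range_len = 1ULL << 30;	/* ... for 1 GiB */

	/* byte range -> block range, as at the top of ext4_trim_fs() */
	unsigned long long start = range_start >> BLOCKSIZE_BITS;
	unsigned long long end = start + (range_len >> BLOCKSIZE_BITS) - 1;

	/*
	 * block -> (group, offset), as ext4_get_group_no_and_offset()
	 * (which also subtracts s_first_data_block, assumed 0 here)
	 */
	unsigned long long first_group = start / BLOCKS_PER_GROUP;
	unsigned long long first_off = start % BLOCKS_PER_GROUP;
	unsigned long long last_group = end / BLOCKS_PER_GROUP;
	unsigned long long last_off = end % BLOCKS_PER_GROUP;

	printf("blocks %llu..%llu -> group %llu off %llu .. group %llu off %llu\n",
	       start, end, first_group, first_off, last_group, last_off);

	return 0;
}

On these example numbers the sketch prints groups 40 through 47, which is exactly the group range the loop in ext4_trim_fs() would walk.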
From: Simon Glass <simon.glass@canonical.com> Copy namei.c from Linux v6.18 fs/ext4 directory. This file implements directory operations including lookup, create, link, unlink, and rename. Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> --- fs/ext4l/namei.c | 4241 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 4241 insertions(+) create mode 100644 fs/ext4l/namei.c diff --git a/fs/ext4l/namei.c b/fs/ext4l/namei.c new file mode 100644 index 00000000000..2cd36f59c9e --- /dev/null +++ b/fs/ext4l/namei.c @@ -0,0 +1,4241 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * Directory entry file type support and forward compatibility hooks + * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 + * Hash Tree Directory indexing (c) + * Daniel Phillips, 2001 + * Hash Tree Directory indexing porting + * Christopher Li, 2002 + * Hash Tree Directory indexing cleanup + * Theodore Ts'o, 2002 + */ + +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/time.h> +#include <linux/fcntl.h> +#include <linux/stat.h> +#include <linux/string.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> +#include <linux/bio.h> +#include <linux/iversion.h> +#include <linux/unicode.h> +#include "ext4.h" +#include "ext4_jbd2.h" + +#include "xattr.h" +#include "acl.h" + +#include <trace/events/ext4.h> +/* + * define how far ahead to read directories while searching them. + */ +#define NAMEI_RA_CHUNKS 2 +#define NAMEI_RA_BLOCKS 4 +#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + +static struct buffer_head *ext4_append(handle_t *handle, + struct inode *inode, + ext4_lblk_t *block) +{ + struct ext4_map_blocks map; + struct buffer_head *bh; + int err; + + if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && + ((inode->i_size >> 10) >= + EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) + return ERR_PTR(-ENOSPC); + + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + map.m_lblk = *block; + map.m_len = 1; + + /* + * We're appending new directory block. Make sure the block is not + * allocated yet, otherwise we will end up corrupting the + * directory. 
+ */
+ err = ext4_map_blocks(NULL, inode, &map, 0);
+ if (err < 0)
+ return ERR_PTR(err);
+ if (err) {
+ EXT4_ERROR_INODE(inode, "Logical block already allocated");
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+
+ bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE);
+ if (IS_ERR(bh))
+ return bh;
+ inode->i_size += inode->i_sb->s_blocksize;
+ EXT4_I(inode)->i_disksize = inode->i_size;
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err)
+ goto out;
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out;
+ return bh;
+
+out:
+ brelse(bh);
+ ext4_std_error(inode->i_sb, err);
+ return ERR_PTR(err);
+}
+
+static int ext4_dx_csum_verify(struct inode *inode,
+ struct ext4_dir_entry *dirent);
+
+/*
+ * Hints to ext4_read_dirblock regarding whether we expect a directory
+ * block being read to be an index block, or a block containing
+ * directory entries (and if the latter, whether it was found via a
+ * logical block in an htree index block). This is used to control
+ * what sort of sanity checking ext4_read_dirblock() will do on the
+ * directory block read from the storage device. EITHER means
+ * the caller doesn't know what kind of directory block will be read,
+ * so no specific verification will be done.
+ */
+typedef enum {
+ EITHER, INDEX, DIRENT, DIRENT_HTREE
+} dirblock_type_t;
+
+#define ext4_read_dirblock(inode, block, type) \
+ __ext4_read_dirblock((inode), (block), (type), __func__, __LINE__)
+
+static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
+ ext4_lblk_t block,
+ dirblock_type_t type,
+ const char *func,
+ unsigned int line)
+{
+ struct buffer_head *bh;
+ struct ext4_dir_entry *dirent;
+ int is_dx_block = 0;
+
+ if (block >= inode->i_size >> inode->i_blkbits) {
+ ext4_error_inode(inode, func, line, block,
+ "Attempting to read directory block (%u) that is past i_size (%llu)",
+ block, inode->i_size);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+
+ if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO))
+ bh = ERR_PTR(-EIO);
+ else
+ bh = ext4_bread(NULL, inode, block, 0);
+ if (IS_ERR(bh)) {
+ __ext4_warning(inode->i_sb, func, line,
+ "inode #%lu: lblock %lu: comm %s: "
+ "error %ld reading directory block",
+ inode->i_ino, (unsigned long)block,
+ current->comm, PTR_ERR(bh));
+
+ return bh;
+ }
+ /* The first directory block must not be a hole. */
+ if (!bh && (type == INDEX || type == DIRENT_HTREE || block == 0)) {
+ ext4_error_inode(inode, func, line, block,
+ "Directory hole found for htree %s block %u",
+ (type == INDEX) ? "index" : "leaf", block);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+ if (!bh)
+ return NULL;
+ dirent = (struct ext4_dir_entry *) bh->b_data;
+ /* Determine whether or not we have an index block */
+ if (is_dx(inode)) {
+ if (block == 0)
+ is_dx_block = 1;
+ else if (ext4_rec_len_from_disk(dirent->rec_len,
+ inode->i_sb->s_blocksize) ==
+ inode->i_sb->s_blocksize)
+ is_dx_block = 1;
+ }
+ if (!is_dx_block && type == INDEX) {
+ ext4_error_inode(inode, func, line, block,
+ "directory leaf block found instead of index block");
+ brelse(bh);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+ if (!ext4_has_feature_metadata_csum(inode->i_sb) ||
+ buffer_verified(bh))
+ return bh;
+
+ /*
+ * An empty leaf block can get mistaken for an index block; for
+ * this reason, we can only check the index checksum when the
+ * caller is sure it should be an index block.
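+ * (The detection above treats a block whose first rec_len covers the
+ * whole block as an index block, which is also what an empty leaf
+ * looks like.)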
+ */ + if (is_dx_block && type == INDEX) { + if (ext4_dx_csum_verify(inode, dirent) && + !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) + set_buffer_verified(bh); + else { + ext4_error_inode_err(inode, func, line, block, + EFSBADCRC, + "Directory index failed checksum"); + brelse(bh); + return ERR_PTR(-EFSBADCRC); + } + } + if (!is_dx_block) { + if (ext4_dirblock_csum_verify(inode, bh) && + !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) + set_buffer_verified(bh); + else { + ext4_error_inode_err(inode, func, line, block, + EFSBADCRC, + "Directory block failed checksum"); + brelse(bh); + return ERR_PTR(-EFSBADCRC); + } + } + return bh; +} + +#ifdef DX_DEBUG +#define dxtrace(command) command +#else +#define dxtrace(command) +#endif + +struct fake_dirent +{ + __le32 inode; + __le16 rec_len; + u8 name_len; + u8 file_type; +}; + +struct dx_countlimit +{ + __le16 limit; + __le16 count; +}; + +struct dx_entry +{ + __le32 hash; + __le32 block; +}; + +/* + * dx_root_info is laid out so that if it should somehow get overlaid by a + * dirent the two low bits of the hash version will be zero. Therefore, the + * hash version mod 4 should never be 0. Sincerely, the paranoia department. + */ + +struct dx_root +{ + struct fake_dirent dot; + char dot_name[4]; + struct fake_dirent dotdot; + char dotdot_name[4]; + struct dx_root_info + { + __le32 reserved_zero; + u8 hash_version; + u8 info_length; /* 8 */ + u8 indirect_levels; + u8 unused_flags; + } + info; + struct dx_entry entries[]; +}; + +struct dx_node +{ + struct fake_dirent fake; + struct dx_entry entries[]; +}; + + +struct dx_frame +{ + struct buffer_head *bh; + struct dx_entry *entries; + struct dx_entry *at; +}; + +struct dx_map_entry +{ + u32 hash; + u16 offs; + u16 size; +}; + +/* + * This goes at the end of each htree block. 
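+ * dx_root_limit() and dx_node_limit() below leave room for it when
+ * metadata checksums are enabled.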
+ */ +struct dx_tail { + u32 dt_reserved; + __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */ +}; + +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **res_dir); +static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + struct inode *dir, struct inode *inode); + +/* checksumming functions */ +void ext4_initialize_dirent_tail(struct buffer_head *bh, + unsigned int blocksize) +{ + struct ext4_dir_entry_tail *t = EXT4_DIRENT_TAIL(bh->b_data, blocksize); + + memset(t, 0, sizeof(struct ext4_dir_entry_tail)); + t->det_rec_len = ext4_rec_len_to_disk( + sizeof(struct ext4_dir_entry_tail), blocksize); + t->det_reserved_ft = EXT4_FT_DIR_CSUM; +} + +/* Walk through a dirent block to find a checksum "dirent" at the tail */ +static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, + struct buffer_head *bh) +{ + struct ext4_dir_entry_tail *t; + int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); + +#ifdef PARANOID + struct ext4_dir_entry *d, *top; + + d = (struct ext4_dir_entry *)bh->b_data; + top = (struct ext4_dir_entry *)(bh->b_data + + (blocksize - sizeof(struct ext4_dir_entry_tail))); + while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize)) + d = (struct ext4_dir_entry *)(((void *)d) + + ext4_rec_len_from_disk(d->rec_len, blocksize)); + + if (d != top) + return NULL; + + t = (struct ext4_dir_entry_tail *)d; +#else + t = EXT4_DIRENT_TAIL(bh->b_data, EXT4_BLOCK_SIZE(inode->i_sb)); +#endif + + if (t->det_reserved_zero1 || + (ext4_rec_len_from_disk(t->det_rec_len, blocksize) != + sizeof(struct ext4_dir_entry_tail)) || + t->det_reserved_zero2 || + t->det_reserved_ft != EXT4_FT_DIR_CSUM) + return NULL; + + return t; +} + +static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + __u32 csum; + + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size); + return cpu_to_le32(csum); +} + +#define warn_no_space_for_csum(inode) \ + __warn_no_space_for_csum((inode), __func__, __LINE__) + +static void __warn_no_space_for_csum(struct inode *inode, const char *func, + unsigned int line) +{ + __ext4_warning_inode(inode, func, line, + "No space for directory leaf checksum. 
Please run e2fsck -D."); +} + +int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh) +{ + struct ext4_dir_entry_tail *t; + + if (!ext4_has_feature_metadata_csum(inode->i_sb)) + return 1; + + t = get_dirent_tail(inode, bh); + if (!t) { + warn_no_space_for_csum(inode); + return 0; + } + + if (t->det_checksum != ext4_dirblock_csum(inode, bh->b_data, + (char *)t - bh->b_data)) + return 0; + + return 1; +} + +static void ext4_dirblock_csum_set(struct inode *inode, + struct buffer_head *bh) +{ + struct ext4_dir_entry_tail *t; + + if (!ext4_has_feature_metadata_csum(inode->i_sb)) + return; + + t = get_dirent_tail(inode, bh); + if (!t) { + warn_no_space_for_csum(inode); + return; + } + + t->det_checksum = ext4_dirblock_csum(inode, bh->b_data, + (char *)t - bh->b_data); +} + +int ext4_handle_dirty_dirblock(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) +{ + ext4_dirblock_csum_set(inode, bh); + return ext4_handle_dirty_metadata(handle, inode, bh); +} + +static struct dx_countlimit *get_dx_countlimit(struct inode *inode, + struct ext4_dir_entry *dirent, + int *offset) +{ + struct ext4_dir_entry *dp; + struct dx_root_info *root; + int count_offset; + int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); + unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize); + + if (rlen == blocksize) + count_offset = 8; + else if (rlen == 12) { + dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); + if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12) + return NULL; + root = (struct dx_root_info *)(((void *)dp + 12)); + if (root->reserved_zero || + root->info_length != sizeof(struct dx_root_info)) + return NULL; + count_offset = 32; + } else + return NULL; + + if (offset) + *offset = count_offset; + return (struct dx_countlimit *)(((void *)dirent) + count_offset); +} + +static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, + int count_offset, int count, struct dx_tail *t) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + __u32 csum; + int size; + __u32 dummy_csum = 0; + int offset = offsetof(struct dx_tail, dt_checksum); + + size = count_offset + (count * sizeof(struct dx_entry)); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size); + csum = ext4_chksum(csum, (__u8 *)t, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); + + return cpu_to_le32(csum); +} + +static int ext4_dx_csum_verify(struct inode *inode, + struct ext4_dir_entry *dirent) +{ + struct dx_countlimit *c; + struct dx_tail *t; + int count_offset, limit, count; + + if (!ext4_has_feature_metadata_csum(inode->i_sb)) + return 1; + + c = get_dx_countlimit(inode, dirent, &count_offset); + if (!c) { + EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D."); + return 0; + } + limit = le16_to_cpu(c->limit); + count = le16_to_cpu(c->count); + if (count_offset + (limit * sizeof(struct dx_entry)) > + EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { + warn_no_space_for_csum(inode); + return 0; + } + t = (struct dx_tail *)(((struct dx_entry *)c) + limit); + + if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset, + count, t)) + return 0; + return 1; +} + +static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) +{ + struct dx_countlimit *c; + struct dx_tail *t; + int count_offset, limit, count; + + if (!ext4_has_feature_metadata_csum(inode->i_sb)) + return; + + c = get_dx_countlimit(inode, dirent, &count_offset); + if (!c) { + EXT4_ERROR_INODE(inode, "dir seems corrupt? 
Run e2fsck -D."); + return; + } + limit = le16_to_cpu(c->limit); + count = le16_to_cpu(c->count); + if (count_offset + (limit * sizeof(struct dx_entry)) > + EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { + warn_no_space_for_csum(inode); + return; + } + t = (struct dx_tail *)(((struct dx_entry *)c) + limit); + + t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t); +} + +static inline int ext4_handle_dirty_dx_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) +{ + ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); + return ext4_handle_dirty_metadata(handle, inode, bh); +} + +/* + * p is at least 6 bytes before the end of page + */ +static inline struct ext4_dir_entry_2 * +ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) +{ + return (struct ext4_dir_entry_2 *)((char *)p + + ext4_rec_len_from_disk(p->rec_len, blocksize)); +} + +/* + * Future: use high four bits of block for coalesce-on-delete flags + * Mask them off for now. + */ + +static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) +{ + return le32_to_cpu(entry->block) & 0x0fffffff; +} + +static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) +{ + entry->block = cpu_to_le32(value); +} + +static inline unsigned dx_get_hash(struct dx_entry *entry) +{ + return le32_to_cpu(entry->hash); +} + +static inline void dx_set_hash(struct dx_entry *entry, unsigned value) +{ + entry->hash = cpu_to_le32(value); +} + +static inline unsigned dx_get_count(struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->count); +} + +static inline unsigned dx_get_limit(struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->limit); +} + +static inline void dx_set_count(struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); +} + +static inline void dx_set_limit(struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); +} + +static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) +{ + unsigned int entry_space = dir->i_sb->s_blocksize - + ext4_dir_rec_len(1, NULL) - + ext4_dir_rec_len(2, NULL) - infosize; + + if (ext4_has_feature_metadata_csum(dir->i_sb)) + entry_space -= sizeof(struct dx_tail); + return entry_space / sizeof(struct dx_entry); +} + +static inline unsigned dx_node_limit(struct inode *dir) +{ + unsigned int entry_space = dir->i_sb->s_blocksize - + ext4_dir_rec_len(0, dir); + + if (ext4_has_feature_metadata_csum(dir->i_sb)) + entry_space -= sizeof(struct dx_tail); + return entry_space / sizeof(struct dx_entry); +} + +/* + * Debug + */ +#ifdef DX_DEBUG +static void dx_show_index(char * label, struct dx_entry *entries) +{ + int i, n = dx_get_count (entries); + printk(KERN_DEBUG "%s index", label); + for (i = 0; i < n; i++) { + printk(KERN_CONT " %x->%lu", + i ? 
dx_get_hash(entries + i) : 0, + (unsigned long)dx_get_block(entries + i)); + } + printk(KERN_CONT "\n"); +} + +struct stats +{ + unsigned names; + unsigned space; + unsigned bcount; +}; + +static struct stats dx_show_leaf(struct inode *dir, + struct dx_hash_info *hinfo, + struct ext4_dir_entry_2 *de, + int size, int show_names) +{ + unsigned names = 0, space = 0; + char *base = (char *) de; + struct dx_hash_info h = *hinfo; + + printk("names: "); + while ((char *) de < base + size) + { + if (de->inode) + { + if (show_names) + { +#ifdef CONFIG_FS_ENCRYPTION + int len; + char *name; + struct fscrypt_str fname_crypto_str = + FSTR_INIT(NULL, 0); + int res = 0; + + name = de->name; + len = de->name_len; + if (!IS_ENCRYPTED(dir)) { + /* Directory is not encrypted */ + (void) ext4fs_dirhash(dir, de->name, + de->name_len, &h); + printk("%*.s:(U)%x.%u ", len, + name, h.hash, + (unsigned) ((char *) de + - base)); + } else { + struct fscrypt_str de_name = + FSTR_INIT(name, len); + + /* Directory is encrypted */ + res = fscrypt_fname_alloc_buffer( + len, &fname_crypto_str); + if (res) + printk(KERN_WARNING "Error " + "allocating crypto " + "buffer--skipping " + "crypto\n"); + res = fscrypt_fname_disk_to_usr(dir, + 0, 0, &de_name, + &fname_crypto_str); + if (res) { + printk(KERN_WARNING "Error " + "converting filename " + "from disk to usr" + "\n"); + name = "??"; + len = 2; + } else { + name = fname_crypto_str.name; + len = fname_crypto_str.len; + } + if (IS_CASEFOLDED(dir)) + h.hash = EXT4_DIRENT_HASH(de); + else + (void) ext4fs_dirhash(dir, + de->name, + de->name_len, &h); + printk("%*.s:(E)%x.%u ", len, name, + h.hash, (unsigned) ((char *) de + - base)); + fscrypt_fname_free_buffer( + &fname_crypto_str); + } +#else + int len = de->name_len; + char *name = de->name; + (void) ext4fs_dirhash(dir, de->name, + de->name_len, &h); + printk("%*.s:%x.%u ", len, name, h.hash, + (unsigned) ((char *) de - base)); +#endif + } + space += ext4_dir_rec_len(de->name_len, dir); + names++; + } + de = ext4_next_entry(de, size); + } + printk(KERN_CONT "(%i)\n", names); + return (struct stats) { names, space, 1 }; +} + +struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, + struct dx_entry *entries, int levels) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count = dx_get_count(entries), names = 0, space = 0, i; + unsigned bcount = 0; + struct buffer_head *bh; + printk("%i indexed blocks...\n", count); + for (i = 0; i < count; i++, entries++) + { + ext4_lblk_t block = dx_get_block(entries); + ext4_lblk_t hash = i ? dx_get_hash(entries): 0; + u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; + struct stats stats; + printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); + bh = ext4_bread(NULL,dir, block, 0); + if (!bh || IS_ERR(bh)) + continue; + stats = levels? + dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): + dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) + bh->b_data, blocksize, 0); + names += stats.names; + space += stats.space; + bcount += stats.bcount; + brelse(bh); + } + if (bcount) + printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", + levels ? 
"" : " ", names, space/bcount, + (space/bcount)*100/blocksize); + return (struct stats) { names, space, bcount}; +} + +/* + * Linear search cross check + */ +static inline void htree_rep_invariant_check(struct dx_entry *at, + struct dx_entry *target, + u32 hash, unsigned int n) +{ + while (n--) { + dxtrace(printk(KERN_CONT ",")); + if (dx_get_hash(++at) > hash) { + at--; + break; + } + } + ASSERT(at == target - 1); +} +#else /* DX_DEBUG */ +static inline void htree_rep_invariant_check(struct dx_entry *at, + struct dx_entry *target, + u32 hash, unsigned int n) +{ +} +#endif /* DX_DEBUG */ + +/* + * Probe for a directory leaf block to search. + * + * dx_probe can return ERR_BAD_DX_DIR, which means there was a format + * error in the directory index, and the caller should fall back to + * searching the directory normally. The callers of dx_probe **MUST** + * check for this error code, and make sure it never gets reflected + * back to userspace. + */ +static struct dx_frame * +dx_probe(struct ext4_filename *fname, struct inode *dir, + struct dx_hash_info *hinfo, struct dx_frame *frame_in) +{ + unsigned count, indirect, level, i; + struct dx_entry *at, *entries, *p, *q, *m; + struct dx_root *root; + struct dx_frame *frame = frame_in; + struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); + u32 hash; + ext4_lblk_t block; + ext4_lblk_t blocks[EXT4_HTREE_LEVEL]; + + memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); + frame->bh = ext4_read_dirblock(dir, 0, INDEX); + if (IS_ERR(frame->bh)) + return (struct dx_frame *) frame->bh; + + root = (struct dx_root *) frame->bh->b_data; + if (root->info.hash_version != DX_HASH_TEA && + root->info.hash_version != DX_HASH_HALF_MD4 && + root->info.hash_version != DX_HASH_LEGACY && + root->info.hash_version != DX_HASH_SIPHASH) { + ext4_warning_inode(dir, "Unrecognised inode hash code %u", + root->info.hash_version); + goto fail; + } + if (ext4_hash_in_dirent(dir)) { + if (root->info.hash_version != DX_HASH_SIPHASH) { + ext4_warning_inode(dir, + "Hash in dirent, but hash is not SIPHASH"); + goto fail; + } + } else { + if (root->info.hash_version == DX_HASH_SIPHASH) { + ext4_warning_inode(dir, + "Hash code is SIPHASH, but hash not in dirent"); + goto fail; + } + } + if (fname) + hinfo = &fname->hinfo; + hinfo->hash_version = root->info.hash_version; + if (hinfo->hash_version <= DX_HASH_TEA) + hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; + /* hash is already computed for encrypted casefolded directory */ + if (fname && fname_name(fname) && + !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir))) { + int ret = ext4fs_dirhash(dir, fname_name(fname), + fname_len(fname), hinfo); + if (ret < 0) { + ret_err = ERR_PTR(ret); + goto fail; + } + } + hash = hinfo->hash; + + if (root->info.unused_flags & 1) { + ext4_warning_inode(dir, "Unimplemented hash flags: %#06x", + root->info.unused_flags); + goto fail; + } + + indirect = root->info.indirect_levels; + if (indirect >= ext4_dir_htree_level(dir->i_sb)) { + ext4_warning(dir->i_sb, + "Directory (ino: %lu) htree depth %#06x exceed" + "supported value", dir->i_ino, + ext4_dir_htree_level(dir->i_sb)); + if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { + ext4_warning(dir->i_sb, "Enable large directory " + "feature to access it"); + } + goto fail; + } + + entries = (struct dx_entry *)(((char *)&root->info) + + root->info.info_length); + + if (dx_get_limit(entries) != dx_root_limit(dir, + root->info.info_length)) { + ext4_warning_inode(dir, "dx entry: limit %u 
!= root limit %u",
+ dx_get_limit(entries),
+ dx_root_limit(dir, root->info.info_length));
+ goto fail;
+ }
+
+ dxtrace(printk("Look up %x", hash));
+ level = 0;
+ blocks[0] = 0;
+ while (1) {
+ count = dx_get_count(entries);
+ if (!count || count > dx_get_limit(entries)) {
+ ext4_warning_inode(dir,
+ "dx entry: count %u beyond limit %u",
+ count, dx_get_limit(entries));
+ goto fail;
+ }
+
+ p = entries + 1;
+ q = entries + count - 1;
+ while (p <= q) {
+ m = p + (q - p) / 2;
+ dxtrace(printk(KERN_CONT "."));
+ if (dx_get_hash(m) > hash)
+ q = m - 1;
+ else
+ p = m + 1;
+ }
+
+ htree_rep_invariant_check(entries, p, hash, count - 1);
+
+ at = p - 1;
+ dxtrace(printk(KERN_CONT " %x->%u\n",
+ at == entries ? 0 : dx_get_hash(at),
+ dx_get_block(at)));
+ frame->entries = entries;
+ frame->at = at;
+
+ block = dx_get_block(at);
+ for (i = 0; i <= level; i++) {
+ if (blocks[i] == block) {
+ ext4_warning_inode(dir,
+ "dx entry: tree cycle block %u points back to block %u",
+ blocks[level], block);
+ goto fail;
+ }
+ }
+ if (++level > indirect)
+ return frame;
+ blocks[level] = block;
+ frame++;
+ frame->bh = ext4_read_dirblock(dir, block, INDEX);
+ if (IS_ERR(frame->bh)) {
+ ret_err = (struct dx_frame *) frame->bh;
+ frame->bh = NULL;
+ goto fail;
+ }
+
+ entries = ((struct dx_node *) frame->bh->b_data)->entries;
+
+ if (dx_get_limit(entries) != dx_node_limit(dir)) {
+ ext4_warning_inode(dir,
+ "dx entry: limit %u != node limit %u",
+ dx_get_limit(entries), dx_node_limit(dir));
+ goto fail;
+ }
+ }
+fail:
+ while (frame >= frame_in) {
+ brelse(frame->bh);
+ frame--;
+ }
+
+ if (ret_err == ERR_PTR(ERR_BAD_DX_DIR))
+ ext4_warning_inode(dir,
+ "Corrupt directory, running e2fsck is recommended");
+ return ret_err;
+}
+
+static void dx_release(struct dx_frame *frames)
+{
+ struct dx_root_info *info;
+ int i;
+ unsigned int indirect_levels;
+
+ if (frames[0].bh == NULL)
+ return;
+
+ info = &((struct dx_root *)frames[0].bh->b_data)->info;
+ /* save local copy, "info" may be freed after brelse() */
+ indirect_levels = info->indirect_levels;
+ for (i = 0; i <= indirect_levels; i++) {
+ if (frames[i].bh == NULL)
+ break;
+ brelse(frames[i].bh);
+ frames[i].bh = NULL;
+ }
+}
+
+/*
+ * This function increments the frame pointer to search the next leaf
+ * block, and reads in the necessary intervening nodes if further
+ * searching is necessary. Whether or not the search is necessary is
+ * controlled by the hash parameter. If the hash value is even, then
+ * the search is only continued if the next block starts with that
+ * hash value. This is used if we are searching for a specific file.
+ *
+ * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
+ *
+ * This function returns 1 if the caller should continue to search,
+ * or 0 if it should not. If there is an error reading one of the
+ * index blocks, it will return a negative error code.
+ *
+ * If start_hash is non-null, it will be filled in with the starting
+ * hash of the next page.
+ */
+static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+ struct dx_frame *frame,
+ struct dx_frame *frames,
+ __u32 *start_hash)
+{
+ struct dx_frame *p;
+ struct buffer_head *bh;
+ int num_frames = 0;
+ __u32 bhash;
+
+ p = frame;
+ /*
+ * Find the next leaf page by incrementing the frame pointer.
+ * If we run out of entries in the interior node, loop around and
+ * increment pointer in the parent node. When we break out of
+ * this loop, num_frames indicates the number of interior
+ * nodes that need to be read.
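+ * For example, when the last entry of a second-level node is
+ * exhausted, p backs up to the root frame, the root pointer is
+ * advanced by one, and num_frames == 1 interior node is re-read
+ * on the way back down.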
+ */
+ while (1) {
+ if (++(p->at) < p->entries + dx_get_count(p->entries))
+ break;
+ if (p == frames)
+ return 0;
+ num_frames++;
+ p--;
+ }
+
+ /*
+ * If the hash is 1, then continue only if the next page has a
+ * continuation hash of any value. This is used for readdir
+ * handling. Otherwise, check to see if the hash matches the
+ * desired continuation hash. If it doesn't, return since
+ * there's no point in reading the successive index pages.
+ */
+ bhash = dx_get_hash(p->at);
+ if (start_hash)
+ *start_hash = bhash;
+ if ((hash & 1) == 0) {
+ if ((bhash & ~1) != hash)
+ return 0;
+ }
+ /*
+ * If the hash is HASH_NB_ALWAYS, we always go to the next
+ * block so no check is necessary
+ */
+ while (num_frames--) {
+ bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+ p++;
+ brelse(p->bh);
+ p->bh = bh;
+ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
+ }
+ return 1;
+}
+
+
+/*
+ * This function fills a red-black tree with information from a
+ * directory block. It returns the number of directory entries loaded
+ * into the tree, or a negative error code on failure.
+ */
+static int htree_dirblock_to_tree(struct file *dir_file,
+ struct inode *dir, ext4_lblk_t block,
+ struct dx_hash_info *hinfo,
+ __u32 start_hash, __u32 start_minor_hash)
+{
+ struct buffer_head *bh;
+ struct ext4_dir_entry_2 *de, *top;
+ int err = 0, count = 0;
+ struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str;
+ int csum = ext4_has_feature_metadata_csum(dir->i_sb);
+
+ dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
+ (unsigned long)block));
+ bh = ext4_read_dirblock(dir, block, DIRENT_HTREE);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+
+ de = (struct ext4_dir_entry_2 *) bh->b_data;
+ /* csum entries are not larger in the casefolded encrypted case */
+ top = (struct ext4_dir_entry_2 *) ((char *) de +
+ dir->i_sb->s_blocksize -
+ ext4_dir_rec_len(0,
+ csum ?
NULL : dir)); + /* Check if the directory is encrypted */ + if (IS_ENCRYPTED(dir)) { + err = fscrypt_prepare_readdir(dir); + if (err < 0) { + brelse(bh); + return err; + } + err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, + &fname_crypto_str); + if (err < 0) { + brelse(bh); + return err; + } + } + + for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + bh->b_data, bh->b_size, + (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) + + ((char *)de - bh->b_data))) { + /* silently ignore the rest of the block */ + break; + } + if (ext4_hash_in_dirent(dir)) { + if (de->name_len && de->inode) { + hinfo->hash = EXT4_DIRENT_HASH(de); + hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); + } else { + hinfo->hash = 0; + hinfo->minor_hash = 0; + } + } else { + err = ext4fs_dirhash(dir, de->name, + de->name_len, hinfo); + if (err < 0) { + count = err; + goto errout; + } + } + if ((hinfo->hash < start_hash) || + ((hinfo->hash == start_hash) && + (hinfo->minor_hash < start_minor_hash))) + continue; + if (de->inode == 0) + continue; + if (!IS_ENCRYPTED(dir)) { + tmp_str.name = de->name; + tmp_str.len = de->name_len; + err = ext4_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de, + &tmp_str); + } else { + int save_len = fname_crypto_str.len; + struct fscrypt_str de_name = FSTR_INIT(de->name, + de->name_len); + + /* Directory is encrypted */ + err = fscrypt_fname_disk_to_usr(dir, hinfo->hash, + hinfo->minor_hash, &de_name, + &fname_crypto_str); + if (err) { + count = err; + goto errout; + } + err = ext4_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de, + &fname_crypto_str); + fname_crypto_str.len = save_len; + } + if (err != 0) { + count = err; + goto errout; + } + count++; + } +errout: + brelse(bh); + fscrypt_fname_free_buffer(&fname_crypto_str); + return count; +} + + +/* + * This function fills a red-black tree with information from a + * directory. We start scanning the directory in hash order, starting + * at start_hash and start_minor_hash. + * + * This function returns the number of entries inserted into the tree, + * or a negative error code. + */ +int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash) +{ + struct dx_hash_info hinfo; + struct ext4_dir_entry_2 *de; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct inode *dir; + ext4_lblk_t block; + int count = 0; + int ret, err; + __u32 hashval; + struct fscrypt_str tmp_str; + + dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", + start_hash, start_minor_hash)); + dir = file_inode(dir_file); + if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) { + if (ext4_hash_in_dirent(dir)) + hinfo.hash_version = DX_HASH_SIPHASH; + else + hinfo.hash_version = + EXT4_SB(dir->i_sb)->s_def_hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += + EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + count = ext4_inlinedir_to_tree(dir_file, dir, 0, + &hinfo, start_hash, + start_minor_hash, + &has_inline_data); + if (has_inline_data) { + *next_hash = ~0; + return count; + } + } + count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, + start_hash, start_minor_hash); + *next_hash = ~0; + return count; + } + hinfo.hash = start_hash; + hinfo.minor_hash = 0; + frame = dx_probe(NULL, dir, &hinfo, frames); + if (IS_ERR(frame)) + return PTR_ERR(frame); + + /* Add '.' and '..' 
from the htree header */ + if (!start_hash && !start_minor_hash) { + de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; + tmp_str.name = de->name; + tmp_str.len = de->name_len; + err = ext4_htree_store_dirent(dir_file, 0, 0, + de, &tmp_str); + if (err != 0) + goto errout; + count++; + } + if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { + de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; + de = ext4_next_entry(de, dir->i_sb->s_blocksize); + tmp_str.name = de->name; + tmp_str.len = de->name_len; + err = ext4_htree_store_dirent(dir_file, 2, 0, + de, &tmp_str); + if (err != 0) + goto errout; + count++; + } + + while (1) { + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto errout; + } + cond_resched(); + block = dx_get_block(frame->at); + ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, + start_hash, start_minor_hash); + if (ret < 0) { + err = ret; + goto errout; + } + count += ret; + hashval = ~0; + ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, + frame, frames, &hashval); + *next_hash = hashval; + if (ret < 0) { + err = ret; + goto errout; + } + /* + * Stop if: (a) there are no more entries, or + * (b) we have inserted at least one entry and the + * next hash value is not a continuation + */ + if ((ret == 0) || + (count && ((hashval & 1) == 0))) + break; + } + dx_release(frames); + dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, " + "next hash: %x\n", count, *next_hash)); + return count; +errout: + dx_release(frames); + return (err); +} + +static inline int search_dirblock(struct buffer_head *bh, + struct inode *dir, + struct ext4_filename *fname, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir) +{ + return ext4_search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, + fname, offset, res_dir); +} + +/* + * Directory block splitting, compacting + */ + +/* + * Create map of hash values, offsets, and sizes, stored at end of block. + * Returns number of entries mapped. 
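+ * The map is filled from the end of the block backwards, one entry
+ * per in-use dirent; map_tail is pre-decremented for each entry added.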
+ */ +static int dx_make_map(struct inode *dir, struct buffer_head *bh, + struct dx_hash_info *hinfo, + struct dx_map_entry *map_tail) +{ + int count = 0; + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)bh->b_data; + unsigned int buflen = bh->b_size; + char *base = bh->b_data; + struct dx_hash_info h = *hinfo; + int blocksize = EXT4_BLOCK_SIZE(dir->i_sb); + + if (ext4_has_feature_metadata_csum(dir->i_sb)) + buflen -= sizeof(struct ext4_dir_entry_tail); + + while ((char *) de < base + buflen) { + if (ext4_check_dir_entry(dir, NULL, de, bh, base, buflen, + ((char *)de) - base)) + return -EFSCORRUPTED; + if (de->name_len && de->inode) { + if (ext4_hash_in_dirent(dir)) + h.hash = EXT4_DIRENT_HASH(de); + else { + int err = ext4fs_dirhash(dir, de->name, + de->name_len, &h); + if (err < 0) + return err; + } + map_tail--; + map_tail->hash = h.hash; + map_tail->offs = ((char *) de - base)>>2; + map_tail->size = ext4_rec_len_from_disk(de->rec_len, + blocksize); + count++; + cond_resched(); + } + de = ext4_next_entry(de, blocksize); + } + return count; +} + +/* Sort map by hash value */ +static void dx_sort_map (struct dx_map_entry *map, unsigned count) +{ + struct dx_map_entry *p, *q, *top = map + count - 1; + int more; + /* Combsort until bubble sort doesn't suck */ + while (count > 2) { + count = count*10/13; + if (count - 9 < 2) /* 9, 10 -> 11 */ + count = 11; + for (p = top, q = p - count; q >= map; p--, q--) + if (p->hash < q->hash) + swap(*p, *q); + } + /* Garden variety bubble sort */ + do { + more = 0; + q = top; + while (q-- > map) { + if (q[1].hash >= q[0].hash) + continue; + swap(*(q+1), *q); + more = 1; + } + } while(more); +} + +static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) +{ + struct dx_entry *entries = frame->entries; + struct dx_entry *old = frame->at, *new = old + 1; + int count = dx_get_count(entries); + + ASSERT(count < dx_get_limit(entries)); + ASSERT(old < entries + count); + memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); + dx_set_hash(new, hash); + dx_set_block(new, block); + dx_set_count(entries, count + 1); +} + +#if IS_ENABLED(CONFIG_UNICODE) +int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, + struct ext4_filename *name) +{ + struct qstr *cf_name = &name->cf_name; + unsigned char *buf; + struct dx_hash_info *hinfo = &name->hinfo; + int len; + + if (!IS_CASEFOLDED(dir) || + (IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir))) { + cf_name->name = NULL; + return 0; + } + + buf = kmalloc(EXT4_NAME_LEN, GFP_NOFS); + if (!buf) + return -ENOMEM; + + len = utf8_casefold(dir->i_sb->s_encoding, iname, buf, EXT4_NAME_LEN); + if (len <= 0) { + kfree(buf); + buf = NULL; + } + cf_name->name = buf; + cf_name->len = (unsigned) len; + + if (!IS_ENCRYPTED(dir)) + return 0; + + hinfo->hash_version = DX_HASH_SIPHASH; + hinfo->seed = NULL; + if (cf_name->name) + return ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo); + else + return ext4fs_dirhash(dir, iname->name, iname->len, hinfo); +} +#endif + +/* + * Test whether a directory entry matches the filename being searched for. + * + * Return: %true if the directory entry matches, otherwise %false. 
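+ * An entry whose inode number is zero (an unused entry) never matches.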
+ */ +static bool ext4_match(struct inode *parent, + const struct ext4_filename *fname, + struct ext4_dir_entry_2 *de) +{ + struct fscrypt_name f; + + if (!de->inode) + return false; + + f.usr_fname = fname->usr_fname; + f.disk_name = fname->disk_name; +#ifdef CONFIG_FS_ENCRYPTION + f.crypto_buf = fname->crypto_buf; +#endif + +#if IS_ENABLED(CONFIG_UNICODE) + if (IS_CASEFOLDED(parent) && + (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) { + /* + * Just checking IS_ENCRYPTED(parent) below is not + * sufficient to decide whether one can use the hash for + * skipping the string comparison, because the key might + * have been added right after + * ext4_fname_setup_ci_filename(). In this case, a hash + * mismatch will be a false negative. Therefore, make + * sure cf_name was properly initialized before + * considering the calculated hash. + */ + if (sb_no_casefold_compat_fallback(parent->i_sb) && + IS_ENCRYPTED(parent) && fname->cf_name.name && + (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || + fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de))) + return false; + /* + * Treat comparison errors as not a match. The + * only case where it happens is on a disk + * corruption or ENOMEM. + */ + + return generic_ci_match(parent, fname->usr_fname, + &fname->cf_name, de->name, + de->name_len) > 0; + } +#endif + + return fscrypt_match_name(&f, de->name, de->name_len); +} + +/* + * Returns 0 if not found, -EFSCORRUPTED on failure, and 1 on success + */ +int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, + struct inode *dir, struct ext4_filename *fname, + unsigned int offset, struct ext4_dir_entry_2 **res_dir) +{ + struct ext4_dir_entry_2 * de; + char * dlimit; + int de_len; + + de = (struct ext4_dir_entry_2 *)search_buf; + dlimit = search_buf + buf_size; + while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) { + /* this code is executed quadratically often */ + /* do minimal checking `by hand' */ + if (de->name + de->name_len <= dlimit && + ext4_match(dir, fname, de)) { + /* found a match - just to be sure, do + * a full check */ + if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf, + buf_size, offset)) + return -EFSCORRUPTED; + *res_dir = de; + return 1; + } + /* prevent looping on a bad block */ + de_len = ext4_rec_len_from_disk(de->rec_len, + dir->i_sb->s_blocksize); + if (de_len <= 0) + return -EFSCORRUPTED; + offset += de_len; + de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); + } + return 0; +} + +static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, + struct ext4_dir_entry *de) +{ + struct super_block *sb = dir->i_sb; + + if (!is_dx(dir)) + return 0; + if (block == 0) + return 1; + if (de->inode == 0 && + ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) == + sb->s_blocksize) + return 1; + return 0; +} + +/* + * __ext4_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the cache buffer in which the entry was found, and the entry + * itself (as a parameter - res_dir). It does NOT read the inode of the + * entry - you'll have to do that yourself if you want to. + * + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. 
+ */ +static struct buffer_head *__ext4_find_entry(struct inode *dir, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **res_dir, + int *inlined) +{ + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; + struct buffer_head *bh, *ret = NULL; + ext4_lblk_t start, block; + const u8 *name = fname->usr_fname->name; + size_t ra_max = 0; /* Number of bh's in the readahead + buffer, bh_use[] */ + size_t ra_ptr = 0; /* Current index into readahead + buffer */ + ext4_lblk_t nblocks; + int i, namelen, retval; + + *res_dir = NULL; + sb = dir->i_sb; + namelen = fname->usr_fname->len; + if (namelen > EXT4_NAME_LEN) + return NULL; + + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + ret = ext4_find_inline_entry(dir, fname, res_dir, + &has_inline_data); + if (inlined) + *inlined = has_inline_data; + if (has_inline_data || IS_ERR(ret)) + goto cleanup_and_exit; + } + + if ((namelen <= 2) && (name[0] == '.') && + (name[1] == '.' || name[1] == '\0')) { + /* + * "." or ".." will only be in the first block + * NFS may look up ".."; "." should be handled by the VFS + */ + block = start = 0; + nblocks = 1; + goto restart; + } + if (is_dx(dir)) { + ret = ext4_dx_find_entry(dir, fname, res_dir); + /* + * On success, or if the error was file not found, + * return. Otherwise, fall back to doing a search the + * old fashioned way. + */ + if (IS_ERR(ret) && PTR_ERR(ret) == ERR_BAD_DX_DIR) + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); + else if (!sb_no_casefold_compat_fallback(dir->i_sb) && + *res_dir == NULL && IS_CASEFOLDED(dir)) + dxtrace(printk(KERN_DEBUG "ext4_find_entry: casefold " + "failed, falling back\n")); + else + goto cleanup_and_exit; + ret = NULL; + } + nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + if (!nblocks) { + ret = NULL; + goto cleanup_and_exit; + } + start = EXT4_I(dir)->i_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; +restart: + do { + /* + * We deal with the read-ahead logic here. + */ + cond_resched(); + if (ra_ptr >= ra_max) { + /* Refill the readahead buffer */ + ra_ptr = 0; + if (block < start) + ra_max = start - block; + else + ra_max = nblocks - block; + ra_max = min(ra_max, ARRAY_SIZE(bh_use)); + retval = ext4_bread_batch(dir, block, ra_max, + false /* wait */, bh_use); + if (retval) { + ret = ERR_PTR(retval); + ra_max = 0; + goto cleanup_and_exit; + } + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + EXT4_ERROR_INODE_ERR(dir, EIO, + "reading directory lblock %lu", + (unsigned long) block); + brelse(bh); + ret = ERR_PTR(-EIO); + goto cleanup_and_exit; + } + if (!buffer_verified(bh) && + !is_dx_internal_node(dir, block, + (struct ext4_dir_entry *)bh->b_data) && + !ext4_dirblock_csum_verify(dir, bh)) { + EXT4_ERROR_INODE_ERR(dir, EFSBADCRC, + "checksumming directory " + "block %lu", (unsigned long)block); + brelse(bh); + ret = ERR_PTR(-EFSBADCRC); + goto cleanup_and_exit; + } + set_buffer_verified(bh); + i = search_dirblock(bh, dir, fname, + block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); + if (i == 1) { + EXT4_I(dir)->i_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { + brelse(bh); + if (i < 0) { + ret = ERR_PTR(i); + goto cleanup_and_exit; + } + } + next: + if (++block >= nblocks) + block = 0; + } while (block != start); + + /* + * If the directory has grown while we were searching, then + * search the last part of the directory before giving up. 
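+	 * (nblocks is re-read from i_size below, since it may have
+	 * changed since we first sampled it.)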
+ */ + block = nblocks; + nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + if (block < nblocks) { + start = 0; + goto restart; + } + +cleanup_and_exit: + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse(bh_use[ra_ptr]); + return ret; +} + +static struct buffer_head *ext4_find_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *inlined) +{ + int err; + struct ext4_filename fname; + struct buffer_head *bh; + + err = ext4_fname_setup_filename(dir, d_name, 1, &fname); + if (err == -ENOENT) + return NULL; + if (err) + return ERR_PTR(err); + + bh = __ext4_find_entry(dir, &fname, res_dir, inlined); + + ext4_fname_free_filename(&fname); + return bh; +} + +static struct buffer_head *ext4_lookup_entry(struct inode *dir, + struct dentry *dentry, + struct ext4_dir_entry_2 **res_dir) +{ + int err; + struct ext4_filename fname; + struct buffer_head *bh; + + err = ext4_fname_prepare_lookup(dir, dentry, &fname); + if (err == -ENOENT) + return NULL; + if (err) + return ERR_PTR(err); + + bh = __ext4_find_entry(dir, &fname, res_dir, NULL); + + ext4_fname_free_filename(&fname); + return bh; +} + +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **res_dir) +{ + struct super_block * sb = dir->i_sb; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct buffer_head *bh; + ext4_lblk_t block; + int retval; + +#ifdef CONFIG_FS_ENCRYPTION + *res_dir = NULL; +#endif + frame = dx_probe(fname, dir, NULL, frames); + if (IS_ERR(frame)) + return ERR_CAST(frame); + do { + block = dx_get_block(frame->at); + bh = ext4_read_dirblock(dir, block, DIRENT_HTREE); + if (IS_ERR(bh)) + goto errout; + + retval = search_dirblock(bh, dir, fname, + block << EXT4_BLOCK_SIZE_BITS(sb), + res_dir); + if (retval == 1) + goto success; + brelse(bh); + if (retval < 0) { + bh = ERR_PTR(ERR_BAD_DX_DIR); + goto errout; + } + + /* Check to see if we should continue to search */ + retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame, + frames, NULL); + if (retval < 0) { + ext4_warning_inode(dir, + "error %d reading directory index block", + retval); + bh = ERR_PTR(retval); + goto errout; + } + } while (retval == 1); + + bh = NULL; +errout: + dxtrace(printk(KERN_DEBUG "%s not found\n", fname->usr_fname->name)); +success: + dx_release(frames); + return bh; +} + +static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + struct inode *inode; + struct ext4_dir_entry_2 *de; + struct buffer_head *bh; + + if (dentry->d_name.len > EXT4_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + bh = ext4_lookup_entry(dir, dentry, &de); + if (IS_ERR(bh)) + return ERR_CAST(bh); + inode = NULL; + if (bh) { + __u32 ino = le32_to_cpu(de->inode); + brelse(bh); + if (!ext4_valid_inum(dir->i_sb, ino)) { + EXT4_ERROR_INODE(dir, "bad inode number: %u", ino); + return ERR_PTR(-EFSCORRUPTED); + } + if (unlikely(ino == dir->i_ino)) { + EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir", + dentry); + return ERR_PTR(-EFSCORRUPTED); + } + inode = ext4_iget(dir->i_sb, ino, EXT4_IGET_NORMAL); + if (inode == ERR_PTR(-ESTALE)) { + EXT4_ERROR_INODE(dir, + "deleted inode referenced: %u", + ino); + return ERR_PTR(-EFSCORRUPTED); + } + if (!IS_ERR(inode) && IS_ENCRYPTED(dir) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && + !fscrypt_has_permitted_context(dir, inode)) { + ext4_warning(inode->i_sb, + "Inconsistent encryption contexts: %lu/%lu", + dir->i_ino, inode->i_ino); + 
iput(inode); + return ERR_PTR(-EPERM); + } + } + + if (IS_ENABLED(CONFIG_UNICODE) && !inode && IS_CASEFOLDED(dir)) { + /* Eventually we want to call d_add_ci(dentry, NULL) + * for negative dentries in the encoding case as + * well. For now, prevent the negative dentry + * from being cached. + */ + return NULL; + } + + return d_splice_alias(inode, dentry); +} + + +struct dentry *ext4_get_parent(struct dentry *child) +{ + __u32 ino; + struct ext4_dir_entry_2 * de; + struct buffer_head *bh; + + bh = ext4_find_entry(d_inode(child), &dotdot_name, &de, NULL); + if (IS_ERR(bh)) + return ERR_CAST(bh); + if (!bh) + return ERR_PTR(-ENOENT); + ino = le32_to_cpu(de->inode); + brelse(bh); + + if (!ext4_valid_inum(child->d_sb, ino)) { + EXT4_ERROR_INODE(d_inode(child), + "bad parent inode number: %u", ino); + return ERR_PTR(-EFSCORRUPTED); + } + + return d_obtain_alias(ext4_iget(child->d_sb, ino, EXT4_IGET_NORMAL)); +} + +/* + * Move count entries from end of map between two memory locations. + * Returns pointer to last entry moved. + */ +static struct ext4_dir_entry_2 * +dx_move_dirents(struct inode *dir, char *from, char *to, + struct dx_map_entry *map, int count, + unsigned blocksize) +{ + unsigned rec_len = 0; + + while (count--) { + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) + (from + (map->offs<<2)); + rec_len = ext4_dir_rec_len(de->name_len, dir); + + memcpy (to, de, rec_len); + ((struct ext4_dir_entry_2 *) to)->rec_len = + ext4_rec_len_to_disk(rec_len, blocksize); + + /* wipe dir_entry excluding the rec_len field */ + de->inode = 0; + memset(&de->name_len, 0, ext4_rec_len_from_disk(de->rec_len, + blocksize) - + offsetof(struct ext4_dir_entry_2, + name_len)); + + map++; + to += rec_len; + } + return (struct ext4_dir_entry_2 *) (to - rec_len); +} + +/* + * Compact each dir entry in the range to the minimal rec_len. + * Returns pointer to last entry in range. + */ +static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, + unsigned int blocksize) +{ + struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; + unsigned rec_len = 0; + + prev = to = de; + while ((char*)de < base + blocksize) { + next = ext4_next_entry(de, blocksize); + if (de->inode && de->name_len) { + rec_len = ext4_dir_rec_len(de->name_len, dir); + if (de > to) + memmove(to, de, rec_len); + to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); + prev = to; + to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); + } + de = next; + } + return prev; +} + +/* + * Split a full leaf block to make room for a new dir entry. + * Allocate a new block, and move entries so that they are approx. equally full. + * Returns pointer to de in block into which the new entry will be inserted. 
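+ *
+ * The split works by hashing every live entry into a map at the end of
+ * the new block, sorting the map, and then choosing a split point such
+ * that neither resulting block ends up more than half full.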
+ */ +static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + struct buffer_head **bh,struct dx_frame *frame, + struct dx_hash_info *hinfo) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned continued; + int count; + struct buffer_head *bh2; + ext4_lblk_t newblock; + u32 hash2; + struct dx_map_entry *map; + char *data1 = (*bh)->b_data, *data2; + unsigned split, move, size; + struct ext4_dir_entry_2 *de = NULL, *de2; + int csum_size = 0; + int err = 0, i; + + if (ext4_has_feature_metadata_csum(dir->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + bh2 = ext4_append(handle, dir, &newblock); + if (IS_ERR(bh2)) { + brelse(*bh); + *bh = NULL; + return ERR_CAST(bh2); + } + + BUFFER_TRACE(*bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, dir->i_sb, *bh, + EXT4_JTR_NONE); + if (err) + goto journal_error; + + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, dir->i_sb, frame->bh, + EXT4_JTR_NONE); + if (err) + goto journal_error; + + data2 = bh2->b_data; + + /* create map in the end of data2 block */ + map = (struct dx_map_entry *) (data2 + blocksize); + count = dx_make_map(dir, *bh, hinfo, map); + if (count < 0) { + err = count; + goto journal_error; + } + map -= count; + dx_sort_map(map, count); + /* Ensure that neither split block is over half full */ + size = 0; + move = 0; + for (i = count-1; i >= 0; i--) { + /* is more than half of this entry in 2nd half of the block? */ + if (size + map[i].size/2 > blocksize/2) + break; + size += map[i].size; + move++; + } + /* + * map index at which we will split + * + * If the sum of active entries didn't exceed half the block size, just + * split it in half by count; each resulting block will have at least + * half the space free. + */ + if (i >= 0) + split = count - move; + else + split = count/2; + + if (WARN_ON_ONCE(split == 0)) { + /* Should never happen, but avoid out-of-bounds access below */ + ext4_error_inode_block(dir, (*bh)->b_blocknr, 0, + "bad indexed directory? hash=%08x:%08x count=%d move=%u", + hinfo->hash, hinfo->minor_hash, count, move); + err = -EFSCORRUPTED; + goto out; + } + + hash2 = map[split].hash; + continued = hash2 == map[split - 1].hash; + dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n", + (unsigned long)dx_get_block(frame->at), + hash2, split, count-split)); + + /* Fancy dance to stay within two buffers */ + de2 = dx_move_dirents(dir, data1, data2, map + split, count - split, + blocksize); + de = dx_pack_dirents(dir, data1, blocksize); + de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - + (char *) de, + blocksize); + de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) - + (char *) de2, + blocksize); + if (csum_size) { + ext4_initialize_dirent_tail(*bh, blocksize); + ext4_initialize_dirent_tail(bh2, blocksize); + } + + dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data1, + blocksize, 1)); + dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2, + blocksize, 1)); + + /* Which block gets the new entry? 
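+	 * Entries hashing >= hash2 were moved to the new block, so if
+	 * the new name hashes there, swap the buffers so that *bh (and
+	 * the returned de) refer to the block receiving the insertion.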
+	 */
+	if (hinfo->hash >= hash2) {
+		swap(*bh, bh2);
+		de = de2;
+	}
+	dx_insert_block(frame, hash2 + continued, newblock);
+	err = ext4_handle_dirty_dirblock(handle, dir, bh2);
+	if (err)
+		goto journal_error;
+	err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
+	if (err)
+		goto journal_error;
+	brelse(bh2);
+	dxtrace(dx_show_index("frame", frame->entries));
+	return de;
+
+journal_error:
+	ext4_std_error(dir->i_sb, err);
+out:
+	brelse(*bh);
+	brelse(bh2);
+	*bh = NULL;
+	return ERR_PTR(err);
+}
+
+int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh,
+		      void *buf, int buf_size,
+		      struct ext4_filename *fname,
+		      struct ext4_dir_entry_2 **dest_de)
+{
+	struct ext4_dir_entry_2 *de;
+	unsigned short reclen = ext4_dir_rec_len(fname_len(fname), dir);
+	int nlen, rlen;
+	unsigned int offset = 0;
+	char *top;
+
+	de = buf;
+	top = buf + buf_size - reclen;
+	while ((char *) de <= top) {
+		if (ext4_check_dir_entry(dir, NULL, de, bh,
+					 buf, buf_size, offset))
+			return -EFSCORRUPTED;
+		if (ext4_match(dir, fname, de))
+			return -EEXIST;
+		nlen = ext4_dir_rec_len(de->name_len, dir);
+		rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+		if ((de->inode ? rlen - nlen : rlen) >= reclen)
+			break;
+		de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
+		offset += rlen;
+	}
+	if ((char *) de > top)
+		return -ENOSPC;
+
+	*dest_de = de;
+	return 0;
+}
+
+void ext4_insert_dentry(struct inode *dir,
+			struct inode *inode,
+			struct ext4_dir_entry_2 *de,
+			int buf_size,
+			struct ext4_filename *fname)
+{
+
+	int nlen, rlen;
+
+	nlen = ext4_dir_rec_len(de->name_len, dir);
+	rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+	if (de->inode) {
+		struct ext4_dir_entry_2 *de1 =
+			(struct ext4_dir_entry_2 *)((char *)de + nlen);
+		de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size);
+		de->rec_len = ext4_rec_len_to_disk(nlen, buf_size);
+		de = de1;
+	}
+	de->file_type = EXT4_FT_UNKNOWN;
+	de->inode = cpu_to_le32(inode->i_ino);
+	ext4_set_de_type(inode->i_sb, de, inode->i_mode);
+	de->name_len = fname_len(fname);
+	memcpy(de->name, fname_name(fname), fname_len(fname));
+	if (ext4_hash_in_dirent(dir)) {
+		struct dx_hash_info *hinfo = &fname->hinfo;
+
+		EXT4_DIRENT_HASHES(de)->hash = cpu_to_le32(hinfo->hash);
+		EXT4_DIRENT_HASHES(de)->minor_hash =
+			cpu_to_le32(hinfo->minor_hash);
+	}
+}
+
+/*
+ * Add a new entry into a directory (leaf) block. If de is non-NULL,
+ * it points to a directory entry which is guaranteed to be large
+ * enough for the new directory entry. If de is NULL, then
+ * add_dirent_to_buf will attempt to search the directory block for
+ * space. It will return -ENOSPC if no space is available,
+ * -EFSCORRUPTED if the directory block fails validation, and -EEXIST
+ * if the directory entry already exists.
+ */ +static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, + struct inode *dir, + struct inode *inode, struct ext4_dir_entry_2 *de, + struct buffer_head *bh) +{ + unsigned int blocksize = dir->i_sb->s_blocksize; + int csum_size = 0; + int err, err2; + + if (ext4_has_feature_metadata_csum(inode->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + if (!de) { + err = ext4_find_dest_de(dir, bh, bh->b_data, + blocksize - csum_size, fname, &de); + if (err) + return err; + } + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, dir->i_sb, bh, + EXT4_JTR_NONE); + if (err) { + ext4_std_error(dir->i_sb, err); + return err; + } + + /* By now the buffer is marked for journaling */ + ext4_insert_dentry(dir, inode, de, blocksize, fname); + + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend + * on this. + * + * XXX similarly, too many callers depend on + * ext4_new_inode() setting the times, but error + * recovery deletes the inode, so the worst that can + * happen is that the times are slightly out of date + * and/or different from the directory change time. + */ + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); + ext4_update_dx_flag(dir); + inode_inc_iversion(dir); + err2 = ext4_mark_inode_dirty(handle, dir); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_dirblock(handle, dir, bh); + if (err) + ext4_std_error(dir->i_sb, err); + return err ? err : err2; +} + +static bool ext4_check_dx_root(struct inode *dir, struct dx_root *root) +{ + struct fake_dirent *fde; + const char *error_msg; + unsigned int rlen; + unsigned int blocksize = dir->i_sb->s_blocksize; + char *blockend = (char *)root + dir->i_sb->s_blocksize; + + fde = &root->dot; + if (unlikely(fde->name_len != 1)) { + error_msg = "invalid name_len for '.'"; + goto corrupted; + } + if (unlikely(strncmp(root->dot_name, ".", fde->name_len))) { + error_msg = "invalid name for '.'"; + goto corrupted; + } + rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize); + if (unlikely((char *)fde + rlen >= blockend)) { + error_msg = "invalid rec_len for '.'"; + goto corrupted; + } + + fde = &root->dotdot; + if (unlikely(fde->name_len != 2)) { + error_msg = "invalid name_len for '..'"; + goto corrupted; + } + if (unlikely(strncmp(root->dotdot_name, "..", fde->name_len))) { + error_msg = "invalid name for '..'"; + goto corrupted; + } + rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize); + if (unlikely((char *)fde + rlen >= blockend)) { + error_msg = "invalid rec_len for '..'"; + goto corrupted; + } + + return true; + +corrupted: + EXT4_ERROR_INODE(dir, "Corrupt dir, %s, running e2fsck is recommended", + error_msg); + return false; +} + +/* + * This converts a one block unindexed directory to a 3 block indexed + * directory, and adds the dentry to the indexed directory. 
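+ * Block 0 becomes the dx_root, the existing entries are moved out to a
+ * new leaf block, and do_split() then divides that leaf in two before
+ * the new entry is added.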
+ */ +static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, + struct inode *dir, + struct inode *inode, struct buffer_head *bh) +{ + struct buffer_head *bh2; + struct dx_root *root; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries; + struct ext4_dir_entry_2 *de, *de2; + char *data2, *top; + unsigned len; + int retval; + unsigned blocksize; + ext4_lblk_t block; + struct fake_dirent *fde; + int csum_size = 0; + + if (ext4_has_feature_metadata_csum(inode->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); + BUFFER_TRACE(bh, "get_write_access"); + retval = ext4_journal_get_write_access(handle, dir->i_sb, bh, + EXT4_JTR_NONE); + if (retval) { + ext4_std_error(dir->i_sb, retval); + brelse(bh); + return retval; + } + + root = (struct dx_root *) bh->b_data; + if (!ext4_check_dx_root(dir, root)) { + brelse(bh); + return -EFSCORRUPTED; + } + + /* The 0th block becomes the root, move the dirents out */ + fde = &root->dotdot; + de = (struct ext4_dir_entry_2 *)((char *)fde + + ext4_rec_len_from_disk(fde->rec_len, blocksize)); + len = ((char *) root) + (blocksize - csum_size) - (char *) de; + + /* Allocate new block for the 0th block's dirents */ + bh2 = ext4_append(handle, dir, &block); + if (IS_ERR(bh2)) { + brelse(bh); + return PTR_ERR(bh2); + } + ext4_set_inode_flag(dir, EXT4_INODE_INDEX); + data2 = bh2->b_data; + + memcpy(data2, de, len); + memset(de, 0, len); /* wipe old data */ + de = (struct ext4_dir_entry_2 *) data2; + top = data2 + len; + while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) { + if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len, + (char *)de - data2)) { + brelse(bh2); + brelse(bh); + return -EFSCORRUPTED; + } + de = de2; + } + de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) - + (char *) de, blocksize); + + if (csum_size) + ext4_initialize_dirent_tail(bh2, blocksize); + + /* Initialize the root; the dot dirents already exist */ + de = (struct ext4_dir_entry_2 *) (&root->dotdot); + de->rec_len = ext4_rec_len_to_disk( + blocksize - ext4_dir_rec_len(2, NULL), blocksize); + memset (&root->info, 0, sizeof(root->info)); + root->info.info_length = sizeof(root->info); + if (ext4_hash_in_dirent(dir)) + root->info.hash_version = DX_HASH_SIPHASH; + else + root->info.hash_version = + EXT4_SB(dir->i_sb)->s_def_hash_version; + + entries = root->entries; + dx_set_block(entries, 1); + dx_set_count(entries, 1); + dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); + + /* Initialize as for dx_probe */ + fname->hinfo.hash_version = root->info.hash_version; + if (fname->hinfo.hash_version <= DX_HASH_TEA) + fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + + /* casefolded encrypted hashes are computed on fname setup */ + if (!ext4_hash_in_dirent(dir)) { + int err = ext4fs_dirhash(dir, fname_name(fname), + fname_len(fname), &fname->hinfo); + if (err < 0) { + brelse(bh2); + brelse(bh); + return err; + } + } + memset(frames, 0, sizeof(frames)); + frame = frames; + frame->entries = entries; + frame->at = entries; + frame->bh = bh; + + retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh); + if (retval) + goto out_frames; + retval = ext4_handle_dirty_dirblock(handle, dir, bh2); + if (retval) + goto out_frames; + + de = do_split(handle,dir, &bh2, frame, &fname->hinfo); + if (IS_ERR(de)) { + retval = PTR_ERR(de); + goto 
out_frames;
+	}
+
+	retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh2);
+out_frames:
+	/*
+	 * Even if the block split failed, we have to properly write
+	 * out all the changes we did so far. Otherwise we can end up
+	 * with a corrupted filesystem.
+	 */
+	if (retval)
+		ext4_mark_inode_dirty(handle, dir);
+	dx_release(frames);
+	brelse(bh2);
+	return retval;
+}
+
+/*
+ * ext4_add_entry()
+ *
+ * adds a file entry to the specified directory, using the same
+ * semantics as ext4_find_entry(). It returns 0 on success, or a
+ * negative error code on failure.
+ *
+ * NOTE!! The inode part of 'de' is left at 0 - which means you
+ * may not sleep between calling this and putting something into
+ * the entry, as someone else might have used it while you slept.
+ */
+static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+			  struct inode *inode)
+{
+	struct inode *dir = d_inode(dentry->d_parent);
+	struct buffer_head *bh = NULL;
+	struct ext4_dir_entry_2 *de;
+	struct super_block *sb;
+	struct ext4_filename fname;
+	int retval;
+	int dx_fallback=0;
+	unsigned blocksize;
+	ext4_lblk_t block, blocks;
+	int csum_size = 0;
+
+	if (ext4_has_feature_metadata_csum(inode->i_sb))
+		csum_size = sizeof(struct ext4_dir_entry_tail);
+
+	sb = dir->i_sb;
+	blocksize = sb->s_blocksize;
+
+	if (fscrypt_is_nokey_name(dentry))
+		return -ENOKEY;
+
+	if (!generic_ci_validate_strict_name(dir, &dentry->d_name))
+		return -EINVAL;
+
+	retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname);
+	if (retval)
+		return retval;
+
+	if (ext4_has_inline_data(dir)) {
+		retval = ext4_try_add_inline_entry(handle, &fname, dir, inode);
+		if (retval < 0)
+			goto out;
+		if (retval == 1) {
+			retval = 0;
+			goto out;
+		}
+	}
+
+	if (is_dx(dir)) {
+		retval = ext4_dx_add_entry(handle, &fname, dir, inode);
+		if (!retval || (retval != ERR_BAD_DX_DIR))
+			goto out;
+		/* Can we just ignore htree data?
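+		 * Not if metadata checksums are enabled: the index
+		 * nodes would then look like dirent blocks with bad
+		 * checksums. Otherwise fall back to a linear search.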
*/ + if (ext4_has_feature_metadata_csum(sb)) { + EXT4_ERROR_INODE(dir, + "Directory has corrupted htree index."); + retval = -EFSCORRUPTED; + goto out; + } + ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); + dx_fallback++; + retval = ext4_mark_inode_dirty(handle, dir); + if (unlikely(retval)) + goto out; + } + blocks = dir->i_size >> sb->s_blocksize_bits; + for (block = 0; block < blocks; block++) { + bh = ext4_read_dirblock(dir, block, DIRENT); + if (bh == NULL) { + bh = ext4_bread(handle, dir, block, + EXT4_GET_BLOCKS_CREATE); + goto add_to_new_block; + } + if (IS_ERR(bh)) { + retval = PTR_ERR(bh); + bh = NULL; + goto out; + } + retval = add_dirent_to_buf(handle, &fname, dir, inode, + NULL, bh); + if (retval != -ENOSPC) + goto out; + + if (blocks == 1 && !dx_fallback && + ext4_has_feature_dir_index(sb)) { + retval = make_indexed_dir(handle, &fname, dir, + inode, bh); + bh = NULL; /* make_indexed_dir releases bh */ + goto out; + } + brelse(bh); + } + bh = ext4_append(handle, dir, &block); +add_to_new_block: + if (IS_ERR(bh)) { + retval = PTR_ERR(bh); + bh = NULL; + goto out; + } + de = (struct ext4_dir_entry_2 *) bh->b_data; + de->inode = 0; + de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize); + + if (csum_size) + ext4_initialize_dirent_tail(bh, blocksize); + + retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh); +out: + ext4_fname_free_filename(&fname); + brelse(bh); + if (retval == 0) + ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); + return retval; +} + +/* + * Returns 0 for success, or a negative error value + */ +static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + struct inode *dir, struct inode *inode) +{ + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries, *at; + struct buffer_head *bh; + struct super_block *sb = dir->i_sb; + struct ext4_dir_entry_2 *de; + int restart; + int err; + +again: + restart = 0; + frame = dx_probe(fname, dir, NULL, frames); + if (IS_ERR(frame)) + return PTR_ERR(frame); + entries = frame->entries; + at = frame->at; + bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT_HTREE); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + bh = NULL; + goto cleanup; + } + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); + if (err) + goto journal_error; + + err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh); + if (err != -ENOSPC) + goto cleanup; + + err = 0; + /* Block full, should compress but for now just split */ + dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); + /* Need to split index? 
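+	 * Yes, if the leaf's index node is full. Walk towards the root
+	 * looking for a parent with room; if every level up to the
+	 * maximum htree depth is full, the directory cannot be grown.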
*/ + if (dx_get_count(entries) == dx_get_limit(entries)) { + ext4_lblk_t newblock; + int levels = frame - frames + 1; + unsigned int icount; + int add_level = 1; + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; + + while (frame > frames) { + if (dx_get_count((frame - 1)->entries) < + dx_get_limit((frame - 1)->entries)) { + add_level = 0; + break; + } + frame--; /* split higher index block */ + at = frame->at; + entries = frame->entries; + restart = 1; + } + if (add_level && levels == ext4_dir_htree_level(sb)) { + ext4_warning(sb, "Directory (ino: %lu) index full, " + "reach max htree level :%d", + dir->i_ino, levels); + if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { + ext4_warning(sb, "Large directory feature is " + "not enabled on this " + "filesystem"); + } + err = -ENOSPC; + goto cleanup; + } + icount = dx_get_count(entries); + bh2 = ext4_append(handle, dir, &newblock); + if (IS_ERR(bh2)) { + err = PTR_ERR(bh2); + goto cleanup; + } + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; + memset(&node2->fake, 0, sizeof(struct fake_dirent)); + node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, + sb->s_blocksize); + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, frame->bh, + EXT4_JTR_NONE); + if (err) { + brelse(bh2); + goto journal_error; + } + if (!add_level) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); + dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", + icount1, icount2)); + + BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ + err = ext4_journal_get_write_access(handle, sb, + (frame - 1)->bh, + EXT4_JTR_NONE); + if (err) { + brelse(bh2); + goto journal_error; + } + + memcpy((char *) entries2, (char *) (entries + icount1), + icount2 * sizeof(struct dx_entry)); + dx_set_count(entries, icount1); + dx_set_count(entries2, icount2); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Which index block gets the new entry? 
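+			 * If 'at' fell into the upper half that was just
+			 * copied out, redirect frame->at and frame->entries
+			 * into the new node and swap the buffers to match.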
*/ + if (at - entries >= icount1) { + frame->at = at - entries - icount1 + entries2; + frame->entries = entries = entries2; + swap(frame->bh, bh2); + } + dx_insert_block((frame - 1), hash2, newblock); + dxtrace(dx_show_index("node", frame->entries)); + dxtrace(dx_show_index("node", + ((struct dx_node *) bh2->b_data)->entries)); + err = ext4_handle_dirty_dx_node(handle, dir, bh2); + if (err) { + brelse(bh2); + goto journal_error; + } + brelse (bh2); + err = ext4_handle_dirty_dx_node(handle, dir, + (frame - 1)->bh); + if (err) + goto journal_error; + err = ext4_handle_dirty_dx_node(handle, dir, + frame->bh); + if (restart || err) + goto journal_error; + } else { + struct dx_root *dxroot; + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Set up root */ + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); + dxroot = (struct dx_root *)frames[0].bh->b_data; + dxroot->info.indirect_levels += 1; + dxtrace(printk(KERN_DEBUG + "Creating %d level index...\n", + dxroot->info.indirect_levels)); + err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); + if (err) { + brelse(bh2); + goto journal_error; + } + err = ext4_handle_dirty_dx_node(handle, dir, bh2); + brelse(bh2); + restart = 1; + goto journal_error; + } + } + de = do_split(handle, dir, &bh, frame, &fname->hinfo); + if (IS_ERR(de)) { + err = PTR_ERR(de); + goto cleanup; + } + err = add_dirent_to_buf(handle, fname, dir, inode, de, bh); + goto cleanup; + +journal_error: + ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */ +cleanup: + brelse(bh); + dx_release(frames); + /* @restart is true means htree-path has been changed, we need to + * repeat dx_probe() to find out valid htree-path + */ + if (restart && err == 0) + goto again; + return err; +} + +/* + * ext4_generic_delete_entry deletes a directory entry by merging it + * with the previous entry + */ +int ext4_generic_delete_entry(struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size) +{ + struct ext4_dir_entry_2 *de, *pde; + unsigned int blocksize = dir->i_sb->s_blocksize; + int i; + + i = 0; + pde = NULL; + de = entry_buf; + while (i < buf_size - csum_size) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + entry_buf, buf_size, i)) + return -EFSCORRUPTED; + if (de == de_del) { + if (pde) { + pde->rec_len = ext4_rec_len_to_disk( + ext4_rec_len_from_disk(pde->rec_len, + blocksize) + + ext4_rec_len_from_disk(de->rec_len, + blocksize), + blocksize); + + /* wipe entire dir_entry */ + memset(de, 0, ext4_rec_len_from_disk(de->rec_len, + blocksize)); + } else { + /* wipe dir_entry excluding the rec_len field */ + de->inode = 0; + memset(&de->name_len, 0, + ext4_rec_len_from_disk(de->rec_len, + blocksize) - + offsetof(struct ext4_dir_entry_2, + name_len)); + } + + inode_inc_iversion(dir); + return 0; + } + i += ext4_rec_len_from_disk(de->rec_len, blocksize); + pde = de; + de = ext4_next_entry(de, blocksize); + } + return -ENOENT; +} + +static int ext4_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh) +{ + int err, csum_size = 0; + + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + err = ext4_delete_inline_entry(handle, dir, de_del, bh, + &has_inline_data); + if (has_inline_data) + return err; + } + + if (ext4_has_feature_metadata_csum(dir->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + BUFFER_TRACE(bh, "get_write_access"); + 
err = ext4_journal_get_write_access(handle, dir->i_sb, bh,
+					    EXT4_JTR_NONE);
+	if (unlikely(err))
+		goto out;
+
+	err = ext4_generic_delete_entry(dir, de_del, bh, bh->b_data,
+					dir->i_sb->s_blocksize, csum_size);
+	if (err)
+		goto out;
+
+	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_dirblock(handle, dir, bh);
+	if (unlikely(err))
+		goto out;
+
+	return 0;
+out:
+	if (err != -ENOENT)
+		ext4_std_error(dir->i_sb, err);
+	return err;
+}
+
+/*
+ * Set directory link count to 1 if nlinks > EXT4_LINK_MAX, or if nlinks == 2
+ * since this indicates that nlinks count was previously 1 to avoid overflowing
+ * the 16-bit i_links_count field on disk. Directories with i_nlink == 1 mean
+ * that subdirectory link counts are not being maintained accurately.
+ *
+ * The caller has already checked for i_nlink overflow in case the DIR_LINK
+ * feature is not enabled and returned -EMLINK. The is_dx() check is a proxy
+ * for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set
+ * on regular files) and to avoid creating huge/slow non-HTREE directories.
+ */
+static void ext4_inc_count(struct inode *inode)
+{
+	inc_nlink(inode);
+	if (is_dx(inode) &&
+	    (inode->i_nlink > EXT4_LINK_MAX || inode->i_nlink == 2))
+		set_nlink(inode, 1);
+}
+
+/*
+ * If a directory had nlink == 1, then we should let it be 1. This indicates
+ * that the directory has >EXT4_LINK_MAX subdirs.
+ */
+static void ext4_dec_count(struct inode *inode)
+{
+	if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
+		drop_nlink(inode);
+}
+
+
+/*
+ * Add a non-directory inode to a directory. On success, the inode reference
+ * is consumed by dentry instantiation. This is also indicated by clearing of
+ * the *inodep pointer. On failure, the caller is responsible for dropping the
+ * inode reference in a safe context.
+ */
+static int ext4_add_nondir(handle_t *handle,
+		struct dentry *dentry, struct inode **inodep)
+{
+	struct inode *dir = d_inode(dentry->d_parent);
+	struct inode *inode = *inodep;
+	int err = ext4_add_entry(handle, dentry, inode);
+	if (!err) {
+		err = ext4_mark_inode_dirty(handle, inode);
+		if (IS_DIRSYNC(dir))
+			ext4_handle_sync(handle);
+		d_instantiate_new(dentry, inode);
+		*inodep = NULL;
+		return err;
+	}
+	drop_nlink(inode);
+	ext4_mark_inode_dirty(handle, inode);
+	ext4_orphan_add(handle, inode);
+	unlock_new_inode(inode);
+	return err;
+}
+
+/*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
+ * is so far negative - it has no inode.
+ *
+ * If the create succeeds, we fill in the inode information
+ * with d_instantiate().
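+ *
+ * The handle is started inside ext4_new_inode_start_handle(), so the
+ * inode allocation and the directory entry insertion are covered by a
+ * single transaction, with the credits sized for both.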
+ */ +static int ext4_create(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) +{ + handle_t *handle; + struct inode *inode; + int err, credits, retries = 0; + + err = dquot_initialize(dir); + if (err) + return err; + + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); +retry: + inode = ext4_new_inode_start_handle(idmap, dir, mode, &dentry->d_name, + 0, NULL, EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext4_file_inode_operations; + inode->i_fop = &ext4_file_operations; + ext4_set_aops(inode); + err = ext4_add_nondir(handle, dentry, &inode); + if (!err) + ext4_fc_track_create(handle, dentry); + } + if (handle) + ext4_journal_stop(handle); + if (!IS_ERR_OR_NULL(inode)) + iput(inode); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext4_mknod(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) +{ + handle_t *handle; + struct inode *inode; + int err, credits, retries = 0; + + err = dquot_initialize(dir); + if (err) + return err; + + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); +retry: + inode = ext4_new_inode_start_handle(idmap, dir, mode, &dentry->d_name, + 0, NULL, EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); + inode->i_op = &ext4_special_inode_operations; + err = ext4_add_nondir(handle, dentry, &inode); + if (!err) + ext4_fc_track_create(handle, dentry); + } + if (handle) + ext4_journal_stop(handle); + if (!IS_ERR_OR_NULL(inode)) + iput(inode); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext4_tmpfile(struct mnt_idmap *idmap, struct inode *dir, + struct file *file, umode_t mode) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + + err = dquot_initialize(dir); + if (err) + return err; + +retry: + inode = ext4_new_inode_start_handle(idmap, dir, mode, + NULL, 0, NULL, + EXT4_HT_DIR, + EXT4_MAXQUOTAS_TRANS_BLOCKS(dir->i_sb) + + 4 + EXT4_XATTR_TRANS_BLOCKS); + handle = ext4_journal_current_handle(); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext4_file_inode_operations; + inode->i_fop = &ext4_file_operations; + ext4_set_aops(inode); + d_tmpfile(file, inode); + err = ext4_orphan_add(handle, inode); + if (err) + goto err_unlock_inode; + mark_inode_dirty(inode); + unlock_new_inode(inode); + } + if (handle) + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return finish_open_simple(file, err); +err_unlock_inode: + ext4_journal_stop(handle); + unlock_new_inode(inode); + return err; +} + +int ext4_init_dirblock(handle_t *handle, struct inode *inode, + struct buffer_head *bh, unsigned int parent_ino, + void *inline_buf, int inline_size) +{ + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) bh->b_data; + size_t blocksize = bh->b_size; + int csum_size = 0, header_size; + + if (ext4_has_feature_metadata_csum(inode->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; + de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL), + blocksize); + memcpy(de->name, ".", 2); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + + 
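+	/* "." entry is in place; now build ".." immediately after it */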
de = ext4_next_entry(de, blocksize); + de->inode = cpu_to_le32(parent_ino); + de->name_len = 2; + memcpy(de->name, "..", 3); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + if (inline_buf) { + de->rec_len = ext4_rec_len_to_disk( + ext4_dir_rec_len(de->name_len, NULL), + blocksize); + de = ext4_next_entry(de, blocksize); + header_size = (char *)de - bh->b_data; + memcpy((void *)de, inline_buf, inline_size); + ext4_update_final_de(bh->b_data, inline_size + header_size, + blocksize - csum_size); + } else { + de->rec_len = ext4_rec_len_to_disk(blocksize - + (csum_size + ext4_dir_rec_len(1, NULL)), + blocksize); + } + + if (csum_size) + ext4_initialize_dirent_tail(bh, blocksize); + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); + set_buffer_uptodate(bh); + set_buffer_verified(bh); + return ext4_handle_dirty_dirblock(handle, inode, bh); +} + +int ext4_init_new_dir(handle_t *handle, struct inode *dir, + struct inode *inode) +{ + struct buffer_head *dir_block = NULL; + ext4_lblk_t block = 0; + int err; + + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + err = ext4_try_create_inline_dir(handle, dir, inode); + if (err < 0 && err != -ENOSPC) + goto out; + if (!err) + goto out; + } + + set_nlink(inode, 2); + inode->i_size = 0; + dir_block = ext4_append(handle, inode, &block); + if (IS_ERR(dir_block)) + return PTR_ERR(dir_block); + err = ext4_init_dirblock(handle, inode, dir_block, dir->i_ino, NULL, 0); +out: + brelse(dir_block); + return err; +} + +static struct dentry *ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) +{ + handle_t *handle; + struct inode *inode; + int err, err2 = 0, credits, retries = 0; + + if (EXT4_DIR_LINK_MAX(dir)) + return ERR_PTR(-EMLINK); + + err = dquot_initialize(dir); + if (err) + return ERR_PTR(err); + + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); +retry: + inode = ext4_new_inode_start_handle(idmap, dir, S_IFDIR | mode, + &dentry->d_name, + 0, NULL, EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + inode->i_op = &ext4_dir_inode_operations; + inode->i_fop = &ext4_dir_operations; + err = ext4_init_new_dir(handle, dir, inode); + if (err) + goto out_clear_inode; + err = ext4_mark_inode_dirty(handle, inode); + if (!err) + err = ext4_add_entry(handle, dentry, inode); + if (err) { +out_clear_inode: + clear_nlink(inode); + ext4_orphan_add(handle, inode); + unlock_new_inode(inode); + err2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err2)) + err = err2; + ext4_journal_stop(handle); + iput(inode); + goto out_retry; + } + ext4_inc_count(dir); + + ext4_update_dx_flag(dir); + err = ext4_mark_inode_dirty(handle, dir); + if (err) + goto out_clear_inode; + d_instantiate_new(dentry, inode); + ext4_fc_track_create(handle, dentry); + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + +out_stop: + if (handle) + ext4_journal_stop(handle); +out_retry: + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return ERR_PTR(err); +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +bool ext4_empty_dir(struct inode *inode) +{ + unsigned int offset; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + struct super_block *sb; + + if (ext4_has_inline_data(inode)) { + int has_inline_data = 1; + int ret; + + ret = empty_inline_dir(inode, &has_inline_data); + if (has_inline_data) + return ret; + } + + sb = inode->i_sb; + if 
(inode->i_size < ext4_dir_rec_len(1, NULL) + + ext4_dir_rec_len(2, NULL)) { + EXT4_ERROR_INODE(inode, "invalid size"); + return false; + } + bh = ext4_read_dirblock(inode, 0, EITHER); + if (IS_ERR(bh)) + return false; + + de = (struct ext4_dir_entry_2 *) bh->b_data; + if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, + 0) || + le32_to_cpu(de->inode) != inode->i_ino || de->name_len != 1 || + de->name[0] != '.') { + ext4_warning_inode(inode, "directory missing '.'"); + brelse(bh); + return false; + } + offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); + de = ext4_next_entry(de, sb->s_blocksize); + if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, + offset) || + le32_to_cpu(de->inode) == 0 || de->name_len != 2 || + de->name[0] != '.' || de->name[1] != '.') { + ext4_warning_inode(inode, "directory missing '..'"); + brelse(bh); + return false; + } + offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); + while (offset < inode->i_size) { + if (!(offset & (sb->s_blocksize - 1))) { + unsigned int lblock; + brelse(bh); + lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); + bh = ext4_read_dirblock(inode, lblock, EITHER); + if (bh == NULL) { + offset += sb->s_blocksize; + continue; + } + if (IS_ERR(bh)) + return false; + } + de = (struct ext4_dir_entry_2 *) (bh->b_data + + (offset & (sb->s_blocksize - 1))); + if (ext4_check_dir_entry(inode, NULL, de, bh, + bh->b_data, bh->b_size, offset) || + le32_to_cpu(de->inode)) { + brelse(bh); + return false; + } + offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); + } + brelse(bh); + return true; +} + +static int ext4_rmdir(struct inode *dir, struct dentry *dentry) +{ + int retval; + struct inode *inode; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + handle_t *handle = NULL; + + retval = ext4_emergency_state(dir->i_sb); + if (unlikely(retval)) + return retval; + + /* Initialize quotas before so that eventual writes go in + * separate transaction */ + retval = dquot_initialize(dir); + if (retval) + return retval; + retval = dquot_initialize(d_inode(dentry)); + if (retval) + return retval; + + retval = -ENOENT; + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + if (IS_ERR(bh)) + return PTR_ERR(bh); + if (!bh) + goto end_rmdir; + + inode = d_inode(dentry); + + retval = -EFSCORRUPTED; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_rmdir; + + retval = -ENOTEMPTY; + if (!ext4_empty_dir(inode)) + goto end_rmdir; + + handle = ext4_journal_start(dir, EXT4_HT_DIR, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + handle = NULL; + goto end_rmdir; + } + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + + retval = ext4_delete_entry(handle, dir, de, bh); + if (retval) + goto end_rmdir; + if (!EXT4_DIR_LINK_EMPTY(inode)) + ext4_warning_inode(inode, + "empty directory '%.*s' has too many links (%u)", + dentry->d_name.len, dentry->d_name.name, + inode->i_nlink); + inode_inc_iversion(inode); + clear_nlink(inode); + /* There's no need to set i_disksize: the fact that i_nlink is + * zero will ensure that the right thing happens during any + * recovery. 
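+	 * (The orphan list entry added below ensures the directory's
+	 * blocks are released if we crash before the delete commits.)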
+	 */
+	inode->i_size = 0;
+	ext4_orphan_add(handle, inode);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
+	inode_set_ctime_current(inode);
+	retval = ext4_mark_inode_dirty(handle, inode);
+	if (retval)
+		goto end_rmdir;
+	ext4_dec_count(dir);
+	ext4_update_dx_flag(dir);
+	ext4_fc_track_unlink(handle, dentry);
+	retval = ext4_mark_inode_dirty(handle, dir);
+
+	/* VFS negative dentries are incompatible with Encoding and
+	 * case-insensitivity. Eventually we'll want to avoid
+	 * invalidating the dentries here, along with returning the
+	 * negative dentries at ext4_lookup(), when it is better
+	 * supported by the VFS for the CI case.
+	 */
+	if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
+		d_invalidate(dentry);
+
+end_rmdir:
+	brelse(bh);
+	if (handle)
+		ext4_journal_stop(handle);
+	return retval;
+}
+
+int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
+		  struct inode *inode,
+		  struct dentry *dentry /* NULL during fast_commit recovery */)
+{
+	int retval = -ENOENT;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de;
+	handle_t *handle;
+	int skip_remove_dentry = 0;
+
+	/*
+	 * Keep this outside the transaction; it may have to set up the
+	 * directory's encryption key, which isn't GFP_NOFS-safe.
+	 */
+	bh = ext4_find_entry(dir, d_name, &de, NULL);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
+
+	if (!bh)
+		return -ENOENT;
+
+	if (le32_to_cpu(de->inode) != inode->i_ino) {
+		/*
+		 * It's okay if we don't find a dentry which matches
+		 * the inode. That's because it might have gotten
+		 * renamed to a different inode number.
+		 */
+		if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+			skip_remove_dentry = 1;
+		else
+			goto out_bh;
+	}
+
+	handle = ext4_journal_start(dir, EXT4_HT_DIR,
+				    EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle)) {
+		retval = PTR_ERR(handle);
+		goto out_bh;
+	}
+
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
+	if (!skip_remove_dentry) {
+		retval = ext4_delete_entry(handle, dir, de, bh);
+		if (retval)
+			goto out_handle;
+		inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
+		ext4_update_dx_flag(dir);
+		retval = ext4_mark_inode_dirty(handle, dir);
+		if (retval)
+			goto out_handle;
+	} else {
+		retval = 0;
+	}
+	if (inode->i_nlink == 0)
+		ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
+				   d_name->len, d_name->name);
+	else
+		drop_nlink(inode);
+	if (!inode->i_nlink)
+		ext4_orphan_add(handle, inode);
+	inode_set_ctime_current(inode);
+	retval = ext4_mark_inode_dirty(handle, inode);
+	if (dentry && !retval)
+		ext4_fc_track_unlink(handle, dentry);
+out_handle:
+	ext4_journal_stop(handle);
+out_bh:
+	brelse(bh);
+	return retval;
+}
+
+static int ext4_unlink(struct inode *dir, struct dentry *dentry)
+{
+	int retval;
+
+	retval = ext4_emergency_state(dir->i_sb);
+	if (unlikely(retval))
+		return retval;
+
+	trace_ext4_unlink_enter(dir, dentry);
+	/*
+	 * Initialize quotas before so that eventual writes go
+	 * in a separate transaction.
+	 */
+	retval = dquot_initialize(dir);
+	if (retval)
+		goto out_trace;
+	retval = dquot_initialize(d_inode(dentry));
+	if (retval)
+		goto out_trace;
+
+	retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry);
+
+	/* VFS negative dentries are incompatible with Encoding and
+	 * case-insensitivity. Eventually we'll want to avoid
+	 * invalidating the dentries here, along with returning the
+	 * negative dentries at ext4_lookup(), when it is better
+	 * supported by the VFS for the CI case.
+ */ + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) + d_invalidate(dentry); + +out_trace: + trace_ext4_unlink_exit(dentry, retval); + return retval; +} + +static int ext4_init_symlink_block(handle_t *handle, struct inode *inode, + struct fscrypt_str *disk_link) +{ + struct buffer_head *bh; + char *kaddr; + int err = 0; + + bh = ext4_bread(handle, inode, 0, EXT4_GET_BLOCKS_CREATE); + if (IS_ERR(bh)) + return PTR_ERR(bh); + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); + if (err) + goto out; + + kaddr = (char *)bh->b_data; + memcpy(kaddr, disk_link->name, disk_link->len); + inode->i_size = disk_link->len - 1; + EXT4_I(inode)->i_disksize = inode->i_size; + err = ext4_handle_dirty_metadata(handle, inode, bh); +out: + brelse(bh); + return err; +} + +static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, const char *symname) +{ + handle_t *handle; + struct inode *inode; + int err, len = strlen(symname); + int credits; + struct fscrypt_str disk_link; + int retries = 0; + + err = ext4_emergency_state(dir->i_sb); + if (unlikely(err)) + return err; + + err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, + &disk_link); + if (err) + return err; + + err = dquot_initialize(dir); + if (err) + return err; + + /* + * EXT4_INDEX_EXTRA_TRANS_BLOCKS for addition of entry into the + * directory. +3 for inode, inode bitmap, group descriptor allocation. + * EXT4_DATA_TRANS_BLOCKS for the data block allocation and + * modification. + */ + credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; +retry: + inode = ext4_new_inode_start_handle(idmap, dir, S_IFLNK|S_IRWXUGO, + &dentry->d_name, 0, NULL, + EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); + if (IS_ERR(inode)) { + if (handle) + ext4_journal_stop(handle); + err = PTR_ERR(inode); + goto out_retry; + } + + if (IS_ENCRYPTED(inode)) { + err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link); + if (err) + goto err_drop_inode; + inode->i_op = &ext4_encrypted_symlink_inode_operations; + } else { + if ((disk_link.len > EXT4_N_BLOCKS * 4)) { + inode->i_op = &ext4_symlink_inode_operations; + } else { + inode->i_op = &ext4_fast_symlink_inode_operations; + } + } + + if ((disk_link.len > EXT4_N_BLOCKS * 4)) { + /* alloc symlink block and fill it */ + err = ext4_init_symlink_block(handle, inode, &disk_link); + if (err) + goto err_drop_inode; + } else { + /* clear the extent format for fast symlink */ + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); + memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name, + disk_link.len); + inode->i_size = disk_link.len - 1; + EXT4_I(inode)->i_disksize = inode->i_size; + if (!IS_ENCRYPTED(inode)) + inode_set_cached_link(inode, (char *)&EXT4_I(inode)->i_data, + inode->i_size); + } + err = ext4_add_nondir(handle, dentry, &inode); + if (handle) + ext4_journal_stop(handle); + iput(inode); + goto out_retry; + +err_drop_inode: + clear_nlink(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_orphan_add(handle, inode); + unlock_new_inode(inode); + if (handle) + ext4_journal_stop(handle); + iput(inode); +out_retry: + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + if (disk_link.name != (unsigned char *)symname) + kfree(disk_link.name); + return err; +} + +int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry) +{ + handle_t *handle; + int err, retries = 0; +retry: + handle = 
ext4_journal_start(dir, EXT4_HT_DIR, + (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + + inode_set_ctime_current(inode); + ext4_inc_count(inode); + ihold(inode); + + err = ext4_add_entry(handle, dentry, inode); + if (!err) { + err = ext4_mark_inode_dirty(handle, inode); + /* this can happen only for tmpfile being + * linked the first time + */ + if (inode->i_nlink == 1) + ext4_orphan_del(handle, inode); + d_instantiate(dentry, inode); + ext4_fc_track_link(handle, dentry); + } else { + drop_nlink(inode); + iput(inode); + } + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext4_link(struct dentry *old_dentry, + struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(old_dentry); + int err; + + if (inode->i_nlink >= EXT4_LINK_MAX) + return -EMLINK; + + err = fscrypt_prepare_link(old_dentry, dir, dentry); + if (err) + return err; + + if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) && + (!projid_eq(EXT4_I(dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + + err = dquot_initialize(dir); + if (err) + return err; + return __ext4_link(dir, inode, dentry); +} + +/* + * Try to find buffer head where contains the parent block. + * It should be the inode block if it is inlined or the 1st block + * if it is a normal dir. + */ +static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, + struct inode *inode, + int *retval, + struct ext4_dir_entry_2 **parent_de, + int *inlined) +{ + struct buffer_head *bh; + + if (!ext4_has_inline_data(inode)) { + struct ext4_dir_entry_2 *de; + unsigned int offset; + + bh = ext4_read_dirblock(inode, 0, EITHER); + if (IS_ERR(bh)) { + *retval = PTR_ERR(bh); + return NULL; + } + + de = (struct ext4_dir_entry_2 *) bh->b_data; + if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, + bh->b_size, 0) || + le32_to_cpu(de->inode) != inode->i_ino || + de->name_len != 1 || de->name[0] != '.') { + EXT4_ERROR_INODE(inode, "directory missing '.'"); + brelse(bh); + *retval = -EFSCORRUPTED; + return NULL; + } + offset = ext4_rec_len_from_disk(de->rec_len, + inode->i_sb->s_blocksize); + de = ext4_next_entry(de, inode->i_sb->s_blocksize); + if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, + bh->b_size, offset) || + le32_to_cpu(de->inode) == 0 || de->name_len != 2 || + de->name[0] != '.' || de->name[1] != '.') { + EXT4_ERROR_INODE(inode, "directory missing '..'"); + brelse(bh); + *retval = -EFSCORRUPTED; + return NULL; + } + *parent_de = de; + + return bh; + } + + *inlined = 1; + return ext4_get_first_inline_block(inode, parent_de, retval); +} + +struct ext4_renament { + struct inode *dir; + struct dentry *dentry; + struct inode *inode; + bool is_dir; + int dir_nlink_delta; + + /* entry for "dentry" */ + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + int inlined; + + /* entry for ".." 
in inode if it's a directory */ + struct buffer_head *dir_bh; + struct ext4_dir_entry_2 *parent_de; + int dir_inlined; +}; + +static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent, bool is_cross) +{ + int retval; + + ent->is_dir = true; + if (!is_cross) + return 0; + + ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode, + &retval, &ent->parent_de, + &ent->dir_inlined); + if (!ent->dir_bh) + return retval; + if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino) + return -EFSCORRUPTED; + BUFFER_TRACE(ent->dir_bh, "get_write_access"); + return ext4_journal_get_write_access(handle, ent->dir->i_sb, + ent->dir_bh, EXT4_JTR_NONE); +} + +static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent, + unsigned dir_ino) +{ + int retval; + + if (!ent->dir_bh) + return 0; + + ent->parent_de->inode = cpu_to_le32(dir_ino); + BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata"); + if (!ent->dir_inlined) { + if (is_dx(ent->inode)) { + retval = ext4_handle_dirty_dx_node(handle, + ent->inode, + ent->dir_bh); + } else { + retval = ext4_handle_dirty_dirblock(handle, ent->inode, + ent->dir_bh); + } + } else { + retval = ext4_mark_inode_dirty(handle, ent->inode); + } + if (retval) { + ext4_std_error(ent->dir->i_sb, retval); + return retval; + } + return 0; +} + +static int ext4_setent(handle_t *handle, struct ext4_renament *ent, + unsigned ino, unsigned file_type) +{ + int retval, retval2; + + BUFFER_TRACE(ent->bh, "get write access"); + retval = ext4_journal_get_write_access(handle, ent->dir->i_sb, ent->bh, + EXT4_JTR_NONE); + if (retval) + return retval; + ent->de->inode = cpu_to_le32(ino); + if (ext4_has_feature_filetype(ent->dir->i_sb)) + ent->de->file_type = file_type; + inode_inc_iversion(ent->dir); + inode_set_mtime_to_ts(ent->dir, inode_set_ctime_current(ent->dir)); + retval = ext4_mark_inode_dirty(handle, ent->dir); + BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); + if (!ent->inlined) { + retval2 = ext4_handle_dirty_dirblock(handle, ent->dir, ent->bh); + if (unlikely(retval2)) { + ext4_std_error(ent->dir->i_sb, retval2); + return retval2; + } + } + return retval; +} + +static void ext4_resetent(handle_t *handle, struct ext4_renament *ent, + unsigned ino, unsigned file_type) +{ + struct ext4_renament old = *ent; + int retval = 0; + + /* + * old->de could have moved from under us during make indexed dir, + * so the old->de may no longer valid and need to find it again + * before reset old inode info. + */ + old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, + &old.inlined); + if (IS_ERR(old.bh)) + retval = PTR_ERR(old.bh); + if (!old.bh) + retval = -ENOENT; + if (retval) { + ext4_std_error(old.dir->i_sb, retval); + return; + } + + ext4_setent(handle, &old, ino, file_type); + brelse(old.bh); +} + +static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, + const struct qstr *d_name) +{ + int retval = -ENOENT; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + + bh = ext4_find_entry(dir, d_name, &de, NULL); + if (IS_ERR(bh)) + return PTR_ERR(bh); + if (bh) { + retval = ext4_delete_entry(handle, dir, de, bh); + brelse(bh); + } + return retval; +} + +static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent, + int force_reread) +{ + int retval; + /* + * ent->de could have moved from under us during htree split, so make + * sure that we are deleting the right entry. 
We might also be pointing + * to a stale entry in the unused part of ent->bh so just checking inum + * and the name isn't enough. + */ + if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino || + ent->de->name_len != ent->dentry->d_name.len || + strncmp(ent->de->name, ent->dentry->d_name.name, + ent->de->name_len) || + force_reread) { + retval = ext4_find_delete_entry(handle, ent->dir, + &ent->dentry->d_name); + } else { + retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh); + if (retval == -ENOENT) { + retval = ext4_find_delete_entry(handle, ent->dir, + &ent->dentry->d_name); + } + } + + if (retval) { + ext4_warning_inode(ent->dir, + "Deleting old file: nlink %d, error=%d", + ent->dir->i_nlink, retval); + } +} + +static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) +{ + if (ent->dir_nlink_delta) { + if (ent->dir_nlink_delta == -1) + ext4_dec_count(ent->dir); + else + ext4_inc_count(ent->dir); + ext4_mark_inode_dirty(handle, ent->dir); + } +} + +static struct inode *ext4_whiteout_for_rename(struct mnt_idmap *idmap, + struct ext4_renament *ent, + int credits, handle_t **h) +{ + struct inode *wh; + handle_t *handle; + int retries = 0; + + /* + * for inode block, sb block, group summaries, + * and inode bitmap + */ + credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) + + EXT4_XATTR_TRANS_BLOCKS + 4); +retry: + wh = ext4_new_inode_start_handle(idmap, ent->dir, + S_IFCHR | WHITEOUT_MODE, + &ent->dentry->d_name, 0, NULL, + EXT4_HT_DIR, credits); + + handle = ext4_journal_current_handle(); + if (IS_ERR(wh)) { + if (handle) + ext4_journal_stop(handle); + if (PTR_ERR(wh) == -ENOSPC && + ext4_should_retry_alloc(ent->dir->i_sb, &retries)) + goto retry; + } else { + *h = handle; + init_special_inode(wh, wh->i_mode, WHITEOUT_DEV); + wh->i_op = &ext4_special_inode_operations; + } + return wh; +} + +/* + * Anybody can rename anything with this: the permission checks are left to the + * higher-level routines. + * + * n.b. 
old_{dentry,inode} refers to the source dentry/inode + while new_{dentry,inode} refers to the destination dentry/inode + * This comes from rename(const char *oldpath, const char *newpath) + */ +static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) +{ + handle_t *handle = NULL; + struct ext4_renament old = { + .dir = old_dir, + .dentry = old_dentry, + .inode = d_inode(old_dentry), + }; + struct ext4_renament new = { + .dir = new_dir, + .dentry = new_dentry, + .inode = d_inode(new_dentry), + }; + int force_reread; + int retval; + struct inode *whiteout = NULL; + int credits; + u8 old_file_type; + + if (new.inode && new.inode->i_nlink == 0) { + EXT4_ERROR_INODE(new.inode, + "target of rename is already freed"); + return -EFSCORRUPTED; + } + + if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) && + (!projid_eq(EXT4_I(new_dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + + retval = dquot_initialize(old.dir); + if (retval) + return retval; + retval = dquot_initialize(old.inode); + if (retval) + return retval; + retval = dquot_initialize(new.dir); + if (retval) + return retval; + + /* Initialize quotas before so that eventual writes go + * in a separate transaction */ + if (new.inode) { + retval = dquot_initialize(new.inode); + if (retval) + return retval; + } + + old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, + &old.inlined); + if (IS_ERR(old.bh)) + return PTR_ERR(old.bh); + + /* + * The check on the inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. Goodbye sticky bit ;-< + */ + retval = -ENOENT; + if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) + goto release_bh; + + new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, + &new.de, &new.inlined); + if (IS_ERR(new.bh)) { + retval = PTR_ERR(new.bh); + new.bh = NULL; + goto release_bh; + } + if (new.bh) { + if (!new.inode) { + brelse(new.bh); + new.bh = NULL; + } + } + if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC)) + ext4_alloc_da_blocks(old.inode); + + credits = (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); + if (!(flags & RENAME_WHITEOUT)) { + handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + goto release_bh; + } + } else { + whiteout = ext4_whiteout_for_rename(idmap, &old, credits, &handle); + if (IS_ERR(whiteout)) { + retval = PTR_ERR(whiteout); + goto release_bh; + } + } + + old_file_type = old.de->file_type; + if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) + ext4_handle_sync(handle); + + if (S_ISDIR(old.inode->i_mode)) { + if (new.inode) { + retval = -ENOTEMPTY; + if (!ext4_empty_dir(new.inode)) + goto end_rename; + } else { + retval = -EMLINK; + if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) + goto end_rename; + } + retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir); + if (retval) + goto end_rename; + } + /* + * If we're renaming a file within an inline_data dir and adding or + * setting the new dirent causes a conversion from inline_data to + * extents/blockmap, we need to force the dirent delete code to + * re-read the directory, or else we end up trying to delete a dirent + * from what is now the extent tree root (or a block map).
+ */ + force_reread = (new.dir->i_ino == old.dir->i_ino && + ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA)); + + if (whiteout) { + /* + * Do this before adding a new entry, so the old entry is sure + * to be still pointing to the valid old entry. + */ + retval = ext4_setent(handle, &old, whiteout->i_ino, + EXT4_FT_CHRDEV); + if (retval) + goto end_rename; + retval = ext4_mark_inode_dirty(handle, whiteout); + if (unlikely(retval)) + goto end_rename; + + } + if (!new.bh) { + retval = ext4_add_entry(handle, new.dentry, old.inode); + if (retval) + goto end_rename; + } else { + retval = ext4_setent(handle, &new, + old.inode->i_ino, old_file_type); + if (retval) + goto end_rename; + } + if (force_reread) + force_reread = !ext4_test_inode_flag(new.dir, + EXT4_INODE_INLINE_DATA); + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + inode_set_ctime_current(old.inode); + retval = ext4_mark_inode_dirty(handle, old.inode); + if (unlikely(retval)) + goto end_rename; + + if (!whiteout) { + /* + * ok, that's it + */ + ext4_rename_delete(handle, &old, force_reread); + } + + if (new.inode) { + ext4_dec_count(new.inode); + inode_set_ctime_current(new.inode); + } + inode_set_mtime_to_ts(old.dir, inode_set_ctime_current(old.dir)); + ext4_update_dx_flag(old.dir); + if (old.is_dir) { + retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); + if (retval) + goto end_rename; + + ext4_dec_count(old.dir); + if (new.inode) { + /* checked ext4_empty_dir above, can't have another + * parent, ext4_dec_count() won't work for many-linked + * dirs */ + clear_nlink(new.inode); + } else { + ext4_inc_count(new.dir); + ext4_update_dx_flag(new.dir); + retval = ext4_mark_inode_dirty(handle, new.dir); + if (unlikely(retval)) + goto end_rename; + } + } + retval = ext4_mark_inode_dirty(handle, old.dir); + if (unlikely(retval)) + goto end_rename; + + if (old.is_dir) { + /* + * We disable fast commits here because the + * replay code is not yet capable of changing dot dot + * dirents in directories.
+ */ + ext4_fc_mark_ineligible(old.inode->i_sb, + EXT4_FC_REASON_RENAME_DIR, handle); + } else { + struct super_block *sb = old.inode->i_sb; + + if (new.inode) + ext4_fc_track_unlink(handle, new.dentry); + if (test_opt2(sb, JOURNAL_FAST_COMMIT) && + !(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) && + !(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))) { + __ext4_fc_track_link(handle, old.inode, new.dentry); + __ext4_fc_track_unlink(handle, old.inode, old.dentry); + if (whiteout) + __ext4_fc_track_create(handle, whiteout, + old.dentry); + } + } + + if (new.inode) { + retval = ext4_mark_inode_dirty(handle, new.inode); + if (unlikely(retval)) + goto end_rename; + if (!new.inode->i_nlink) + ext4_orphan_add(handle, new.inode); + } + retval = 0; + +end_rename: + if (whiteout) { + if (retval) { + ext4_resetent(handle, &old, + old.inode->i_ino, old_file_type); + drop_nlink(whiteout); + ext4_mark_inode_dirty(handle, whiteout); + ext4_orphan_add(handle, whiteout); + } + unlock_new_inode(whiteout); + ext4_journal_stop(handle); + iput(whiteout); + } else { + ext4_journal_stop(handle); + } +release_bh: + brelse(old.dir_bh); + brelse(old.bh); + brelse(new.bh); + + return retval; +} + +static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + handle_t *handle = NULL; + struct ext4_renament old = { + .dir = old_dir, + .dentry = old_dentry, + .inode = d_inode(old_dentry), + }; + struct ext4_renament new = { + .dir = new_dir, + .dentry = new_dentry, + .inode = d_inode(new_dentry), + }; + u8 new_file_type; + int retval; + + if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && + !projid_eq(EXT4_I(new_dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid)) || + (ext4_test_inode_flag(old_dir, EXT4_INODE_PROJINHERIT) && + !projid_eq(EXT4_I(old_dir)->i_projid, + EXT4_I(new_dentry->d_inode)->i_projid))) + return -EXDEV; + + retval = dquot_initialize(old.dir); + if (retval) + return retval; + retval = dquot_initialize(new.dir); + if (retval) + return retval; + + old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, + &old.de, &old.inlined); + if (IS_ERR(old.bh)) + return PTR_ERR(old.bh); + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. 
Goodbye sticky bit ;-< + */ + retval = -ENOENT; + if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) + goto end_rename; + + new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, + &new.de, &new.inlined); + if (IS_ERR(new.bh)) { + retval = PTR_ERR(new.bh); + new.bh = NULL; + goto end_rename; + } + + /* RENAME_EXCHANGE case: old *and* new must both exist */ + if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino) + goto end_rename; + + handle = ext4_journal_start(old.dir, EXT4_HT_DIR, + (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + + 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + handle = NULL; + goto end_rename; + } + + if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) + ext4_handle_sync(handle); + + if (S_ISDIR(old.inode->i_mode)) { + retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir); + if (retval) + goto end_rename; + } + if (S_ISDIR(new.inode->i_mode)) { + retval = ext4_rename_dir_prepare(handle, &new, new.dir != old.dir); + if (retval) + goto end_rename; + } + + /* + * Other than the special case of overwriting a directory, parents' + * nlink only needs to be modified if this is a cross directory rename. + */ + if (old.dir != new.dir && old.is_dir != new.is_dir) { + old.dir_nlink_delta = old.is_dir ? -1 : 1; + new.dir_nlink_delta = -old.dir_nlink_delta; + retval = -EMLINK; + if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) || + (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir))) + goto end_rename; + } + + new_file_type = new.de->file_type; + retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type); + if (retval) + goto end_rename; + + retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type); + if (retval) + goto end_rename; + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + inode_set_ctime_current(old.inode); + inode_set_ctime_current(new.inode); + retval = ext4_mark_inode_dirty(handle, old.inode); + if (unlikely(retval)) + goto end_rename; + retval = ext4_mark_inode_dirty(handle, new.inode); + if (unlikely(retval)) + goto end_rename; + ext4_fc_mark_ineligible(new.inode->i_sb, + EXT4_FC_REASON_CROSS_RENAME, handle); + if (old.dir_bh) { + retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); + if (retval) + goto end_rename; + } + if (new.dir_bh) { + retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino); + if (retval) + goto end_rename; + } + ext4_update_dir_count(handle, &old); + ext4_update_dir_count(handle, &new); + retval = 0; + +end_rename: + brelse(old.dir_bh); + brelse(new.dir_bh); + brelse(old.bh); + brelse(new.bh); + if (handle) + ext4_journal_stop(handle); + return retval; +} + +static int ext4_rename2(struct mnt_idmap *idmap, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + int err; + + err = ext4_emergency_state(old_dir->i_sb); + if (unlikely(err)) + return err; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) + return -EINVAL; + + err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, + flags); + if (err) + return err; + + if (flags & RENAME_EXCHANGE) { + return ext4_cross_rename(old_dir, old_dentry, + new_dir, new_dentry); + } + + return ext4_rename(idmap, old_dir, old_dentry, new_dir, new_dentry, flags); +} + +/* + * directories can handle most operations... 
+ */ +const struct inode_operations ext4_dir_inode_operations = { + .create = ext4_create, + .lookup = ext4_lookup, + .link = ext4_link, + .unlink = ext4_unlink, + .symlink = ext4_symlink, + .mkdir = ext4_mkdir, + .rmdir = ext4_rmdir, + .mknod = ext4_mknod, + .tmpfile = ext4_tmpfile, + .rename = ext4_rename2, + .setattr = ext4_setattr, + .getattr = ext4_getattr, + .listxattr = ext4_listxattr, + .get_inode_acl = ext4_get_acl, + .set_acl = ext4_set_acl, + .fiemap = ext4_fiemap, + .fileattr_get = ext4_fileattr_get, + .fileattr_set = ext4_fileattr_set, +}; + +const struct inode_operations ext4_special_inode_operations = { + .setattr = ext4_setattr, + .getattr = ext4_getattr, + .listxattr = ext4_listxattr, + .get_inode_acl = ext4_get_acl, + .set_acl = ext4_set_acl, +}; -- 2.43.0
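An aside on the RENAME_EXCHANGE path above: ext4_cross_rename() touches the parents' link counts only when the two exchanged entries differ in type and sit in different directories, because a directory child contributes a ".." link to its parent while a non-directory does not. The stand-alone sketch below restates that dir_nlink_delta bookkeeping so it can be checked in isolation; the helper name and test harness are invented for illustration here and are not part of the imported code.

#include <assert.h>
#include <stdbool.h>

/*
 * Illustration only: mirrors the dir_nlink_delta logic in
 * ext4_cross_rename(). When RENAME_EXCHANGE swaps entries of differing
 * type across two parent directories, one parent gains a ".." link and
 * the other loses one; otherwise the link counts are untouched.
 */
static void cross_rename_nlink_deltas(bool same_parent, bool old_is_dir,
				      bool new_is_dir, int *old_dir_delta,
				      int *new_dir_delta)
{
	*old_dir_delta = 0;
	*new_dir_delta = 0;
	if (same_parent || old_is_dir == new_is_dir)
		return;		/* parents keep their link counts */
	/* old.dir gives a directory away (-1) or receives one (+1) */
	*old_dir_delta = old_is_dir ? -1 : 1;
	*new_dir_delta = -*old_dir_delta;
}

int main(void)
{
	int od, nd;

	/* a dir leaves old_dir, a file arrives: old_dir loses its ".." link */
	cross_rename_nlink_deltas(false, true, false, &od, &nd);
	assert(od == -1 && nd == 1);

	/* two dirs exchanged: both ".." entries are rewritten, counts balance */
	cross_rename_nlink_deltas(false, true, true, &od, &nd);
	assert(od == 0 && nd == 0);

	return 0;
}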
From: Simon Glass <simon.glass@canonical.com> Copy super.c from Linux v6.18 fs/ext4 directory. This file implements superblock operations including mount, unmount, and filesystem options. Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> --- fs/ext4l/super.c | 7516 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 7516 insertions(+) create mode 100644 fs/ext4l/super.c diff --git a/fs/ext4l/super.c b/fs/ext4l/super.c new file mode 100644 index 00000000000..33e7c08c952 --- /dev/null +++ b/fs/ext4l/super.c @@ -0,0 +1,7516 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/vmalloc.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/backing-dev.h> +#include <linux/parser.h> +#include <linux/buffer_head.h> +#include <linux/exportfs.h> +#include <linux/vfs.h> +#include <linux/random.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/quotaops.h> +#include <linux/seq_file.h> +#include <linux/ctype.h> +#include <linux/log2.h> +#include <linux/crc16.h> +#include <linux/dax.h> +#include <linux/uaccess.h> +#include <linux/iversion.h> +#include <linux/unicode.h> +#include <linux/part_stat.h> +#include <linux/kthread.h> +#include <linux/freezer.h> +#include <linux/fsnotify.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> + +#include "ext4.h" +#include "ext4_extents.h" /* Needed for trace points definition */ +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" +#include "mballoc.h" +#include "fsmap.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/ext4.h> + +static struct ext4_lazy_init *ext4_li_info; +static DEFINE_MUTEX(ext4_li_mtx); +static struct ratelimit_state ext4_mount_msg_ratelimit; + +static int ext4_load_journal(struct super_block *, struct ext4_super_block *, + unsigned long journal_devnum); +static int ext4_show_options(struct seq_file *seq, struct dentry *root); +static void ext4_update_super(struct super_block *sb); +static int ext4_commit_super(struct super_block *sb); +static int ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block *es); +static int ext4_clear_journal_err(struct super_block *sb, + struct ext4_super_block *es); +static int ext4_sync_fs(struct super_block *sb, int wait); +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); +static int ext4_unfreeze(struct super_block *sb); +static int ext4_freeze(struct super_block *sb); +static inline int ext2_feature_set_ok(struct super_block *sb); +static inline int ext3_feature_set_ok(struct super_block *sb); +static void ext4_unregister_li_request(struct super_block *sb); +static void ext4_clear_request_list(void); +static struct inode *ext4_get_journal_inode(struct super_block *sb, + unsigned int journal_inum); +static int ext4_validate_options(struct fs_context *fc); +static int ext4_check_opt_consistency(struct fs_context *fc, + struct super_block *sb); +static void ext4_apply_options(struct fs_context *fc, struct super_block *sb); +static int 
ext4_parse_param(struct fs_context *fc, struct fs_parameter *param); +static int ext4_get_tree(struct fs_context *fc); +static int ext4_reconfigure(struct fs_context *fc); +static void ext4_fc_free(struct fs_context *fc); +static int ext4_init_fs_context(struct fs_context *fc); +static void ext4_kill_sb(struct super_block *sb); +static const struct fs_parameter_spec ext4_param_specs[]; + +/* + * Lock ordering + * + * page fault path: + * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start + * -> page lock -> i_data_sem (rw) + * + * buffered write path: + * sb_start_write -> i_mutex -> mmap_lock + * sb_start_write -> i_mutex -> transaction start -> page lock -> + * i_data_sem (rw) + * + * truncate: + * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) -> + * page lock + * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start -> + * i_data_sem (rw) + * + * direct IO: + * sb_start_write -> i_mutex -> mmap_lock + * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw) + * + * writepages: + * transaction start -> page lock(s) -> i_data_sem (rw) + */ + +static const struct fs_context_operations ext4_context_ops = { + .parse_param = ext4_parse_param, + .get_tree = ext4_get_tree, + .reconfigure = ext4_reconfigure, + .free = ext4_fc_free, +}; + + +#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) +static struct file_system_type ext2_fs_type = { + .owner = THIS_MODULE, + .name = "ext2", + .init_fs_context = ext4_init_fs_context, + .parameters = ext4_param_specs, + .kill_sb = ext4_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("ext2"); +MODULE_ALIAS("ext2"); +#define IS_EXT2_SB(sb) ((sb)->s_type == &ext2_fs_type) +#else +#define IS_EXT2_SB(sb) (0) +#endif + + +static struct file_system_type ext3_fs_type = { + .owner = THIS_MODULE, + .name = "ext3", + .init_fs_context = ext4_init_fs_context, + .parameters = ext4_param_specs, + .kill_sb = ext4_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("ext3"); +MODULE_ALIAS("ext3"); +#define IS_EXT3_SB(sb) ((sb)->s_type == &ext3_fs_type) + + +static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, + bh_end_io_t *end_io, bool simu_fail) +{ + if (simu_fail) { + clear_buffer_uptodate(bh); + unlock_buffer(bh); + return; + } + + /* + * buffer's verified bit is no longer valid after reading from + * disk again due to write out error, clear it to make sure we + * recheck the buffer contents. + */ + clear_buffer_verified(bh); + + bh->b_end_io = end_io ? 
end_io : end_buffer_read_sync; + get_bh(bh); + submit_bh(REQ_OP_READ | op_flags, bh); +} + +void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, + bh_end_io_t *end_io, bool simu_fail) +{ + BUG_ON(!buffer_locked(bh)); + + if (ext4_buffer_uptodate(bh)) { + unlock_buffer(bh); + return; + } + __ext4_read_bh(bh, op_flags, end_io, simu_fail); +} + +int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, + bh_end_io_t *end_io, bool simu_fail) +{ + BUG_ON(!buffer_locked(bh)); + + if (ext4_buffer_uptodate(bh)) { + unlock_buffer(bh); + return 0; + } + + __ext4_read_bh(bh, op_flags, end_io, simu_fail); + + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return 0; + return -EIO; +} + +int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait) +{ + lock_buffer(bh); + if (!wait) { + ext4_read_bh_nowait(bh, op_flags, NULL, false); + return 0; + } + return ext4_read_bh(bh, op_flags, NULL, false); +} + +/* + * This works like __bread_gfp() except it uses ERR_PTR for error + * returns. Currently with sb_bread it's impossible to distinguish + * between ENOMEM and EIO situations (since both result in a NULL + * return). + */ +static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, + sector_t block, + blk_opf_t op_flags, gfp_t gfp) +{ + struct buffer_head *bh; + int ret; + + bh = sb_getblk_gfp(sb, block, gfp); + if (bh == NULL) + return ERR_PTR(-ENOMEM); + if (ext4_buffer_uptodate(bh)) + return bh; + + ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true); + if (ret) { + put_bh(bh); + return ERR_PTR(ret); + } + return bh; +} + +struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, + blk_opf_t op_flags) +{ + gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, + ~__GFP_FS) | __GFP_MOVABLE; + + return __ext4_sb_bread_gfp(sb, block, op_flags, gfp); +} + +struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, + sector_t block) +{ + gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, + ~__GFP_FS); + + return __ext4_sb_bread_gfp(sb, block, 0, gfp); +} + +struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb, + sector_t block) +{ + gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, + ~__GFP_FS) | __GFP_MOVABLE | __GFP_NOFAIL; + + return __ext4_sb_bread_gfp(sb, block, 0, gfp); +} + +void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) +{ + struct buffer_head *bh = bdev_getblk(sb->s_bdev, block, + sb->s_blocksize, GFP_NOWAIT); + + if (likely(bh)) { + if (trylock_buffer(bh)) + ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL, false); + brelse(bh); + } +} + +static int ext4_verify_csum_type(struct super_block *sb, + struct ext4_super_block *es) +{ + if (!ext4_has_feature_metadata_csum(sb)) + return 1; + + return es->s_checksum_type == EXT4_CRC32C_CHKSUM; +} + +__le32 ext4_superblock_csum(struct ext4_super_block *es) +{ + int offset = offsetof(struct ext4_super_block, s_checksum); + __u32 csum; + + csum = ext4_chksum(~0, (char *)es, offset); + + return cpu_to_le32(csum); +} + +static int ext4_superblock_csum_verify(struct super_block *sb, + struct ext4_super_block *es) +{ + if (!ext4_has_feature_metadata_csum(sb)) + return 1; + + return es->s_checksum == ext4_superblock_csum(es); +} + +void ext4_superblock_csum_set(struct super_block *sb) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + if (!ext4_has_feature_metadata_csum(sb)) + return; + + es->s_checksum = ext4_superblock_csum(es); +} + +ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return
le32_to_cpu(bg->bg_block_bitmap_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); +} + +ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le32_to_cpu(bg->bg_inode_bitmap_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); +} + +ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le32_to_cpu(bg->bg_inode_table_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); +} + +__u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_free_blocks_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); +} + +__u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_lo)) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_hi)) << 16 : 0); +} + +__u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_used_dirs_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); +} + +__u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_itable_unused_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); +} + +void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk) +{ + bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32); +} + +void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk) +{ + bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32); +} + +void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk) +{ + bg->bg_inode_table_lo = cpu_to_le32((u32)blk); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); +} + +void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + WRITE_ONCE(bg->bg_free_inodes_count_lo, cpu_to_le16((__u16)count)); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + WRITE_ONCE(bg->bg_free_inodes_count_hi, cpu_to_le16(count >> 16)); +} + +void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + 
bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); +} + +static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now) +{ + now = clamp_val(now, 0, (1ull << 40) - 1); + + *lo = cpu_to_le32(lower_32_bits(now)); + *hi = upper_32_bits(now); +} + +static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) +{ + return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo); +} +#define ext4_update_tstamp(es, tstamp) \ + __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \ + ktime_get_real_seconds()) +#define ext4_get_tstamp(es, tstamp) \ + __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) + +/* + * The ext4_maybe_update_superblock() function checks and updates the + * superblock if needed. + * + * This function is designed to update the on-disk superblock only under + * certain conditions to prevent excessive disk writes and unnecessary + * waking of the disk from sleep. The superblock will be updated if: + * 1. More than sbi->s_sb_update_sec (def: 1 hour) has passed since the last + * superblock update + * 2. More than sbi->s_sb_update_kb (def: 16MB) kbs have been written since the + * last superblock update. + * + * @sb: The superblock + */ +static void ext4_maybe_update_superblock(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + journal_t *journal = sbi->s_journal; + time64_t now; + __u64 last_update; + __u64 lifetime_write_kbytes; + __u64 diff_size; + + if (ext4_emergency_state(sb) || sb_rdonly(sb) || + !(sb->s_flags & SB_ACTIVE) || !journal || + journal->j_flags & JBD2_UNMOUNT) + return; + + now = ktime_get_real_seconds(); + last_update = ext4_get_tstamp(es, s_wtime); + + if (likely(now - last_update < sbi->s_sb_update_sec)) + return; + + lifetime_write_kbytes = sbi->s_kbytes_written + + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - + sbi->s_sectors_written_start) >> 1); + + /* Get the number of kilobytes not written to disk to account + * for statistics and compare with a multiple of 16 MB. This + * is used to determine when the next superblock commit should + * occur (i.e. not more often than once per 16MB if there was + * less written in an hour). + */ + diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); + + if (diff_size > sbi->s_sb_update_kb) + schedule_work(&EXT4_SB(sb)->s_sb_upd_work); +} + +static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) +{ + struct super_block *sb = journal->j_private; + + BUG_ON(txn->t_state == T_FINISHED); + + ext4_process_freed_data(sb, txn->t_tid); + ext4_maybe_update_superblock(sb); +} + +static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode, + struct folio *folio) +{ + struct buffer_head *bh, *head; + struct journal_head *jh; + + bh = head = folio_buffers(folio); + do { + /* + * We have to redirty a page in these cases: + * 1) If buffer is dirty, it means the page was dirty because it + * contains a buffer that needs checkpointing. So the dirty bit + * needs to be preserved so that checkpointing writes the buffer + * properly. + * 2) If buffer is not part of the committing transaction + * (we may have just accidentally come across this buffer because + * inode range tracking is not exact) or if the currently running + * transaction already contains this buffer as well, dirty bit + * needs to be preserved so that the buffer gets writeprotected + * properly on running transaction's commit. 
+ */ + jh = bh2jh(bh); + if (buffer_dirty(bh) || + (jh && (jh->b_transaction != jinode->i_transaction || + jh->b_next_transaction))) + return true; + } while ((bh = bh->b_this_page) != head); + + return false; +} + +static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) +{ + struct address_space *mapping = jinode->i_vfs_inode->i_mapping; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .range_start = jinode->i_dirty_start, + .range_end = jinode->i_dirty_end, + }; + struct folio *folio = NULL; + int error; + + /* + * writeback_iter() already checks for dirty pages and calls + * folio_clear_dirty_for_io(), which is what we need in order to + * write-protect the folios. + * + * However, we may have to redirty a folio sometimes. + */ + while ((folio = writeback_iter(mapping, &wbc, folio, &error))) { + if (ext4_journalled_writepage_needs_redirty(jinode, folio)) + folio_redirty_for_writepage(&wbc, folio); + folio_unlock(folio); + } + + return error; +} + +static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) +{ + int ret; + + if (ext4_should_journal_data(jinode->i_vfs_inode)) + ret = ext4_journalled_submit_inode_data_buffers(jinode); + else + ret = ext4_normal_submit_inode_data_buffers(jinode); + return ret; +} + +static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) +{ + int ret = 0; + + if (!ext4_should_journal_data(jinode->i_vfs_inode)) + ret = jbd2_journal_finish_inode_data_buffers(jinode); + + return ret; +} + +static bool system_going_down(void) +{ + return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF + || system_state == SYSTEM_RESTART; +} + +struct ext4_err_translation { + int code; + int errno; +}; + +#define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err } + +static struct ext4_err_translation err_translation[] = { + EXT4_ERR_TRANSLATE(EIO), + EXT4_ERR_TRANSLATE(ENOMEM), + EXT4_ERR_TRANSLATE(EFSBADCRC), + EXT4_ERR_TRANSLATE(EFSCORRUPTED), + EXT4_ERR_TRANSLATE(ENOSPC), + EXT4_ERR_TRANSLATE(ENOKEY), + EXT4_ERR_TRANSLATE(EROFS), + EXT4_ERR_TRANSLATE(EFBIG), + EXT4_ERR_TRANSLATE(EEXIST), + EXT4_ERR_TRANSLATE(ERANGE), + EXT4_ERR_TRANSLATE(EOVERFLOW), + EXT4_ERR_TRANSLATE(EBUSY), + EXT4_ERR_TRANSLATE(ENOTDIR), + EXT4_ERR_TRANSLATE(ENOTEMPTY), + EXT4_ERR_TRANSLATE(ESHUTDOWN), + EXT4_ERR_TRANSLATE(EFAULT), +}; + +static int ext4_errno_to_code(int errno) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(err_translation); i++) + if (err_translation[i].errno == errno) + return err_translation[i].code; + return EXT4_ERR_UNKNOWN; +} + +static void save_error_info(struct super_block *sb, int error, + __u32 ino, __u64 block, + const char *func, unsigned int line) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* We default to EFSCORRUPTED error...
*/ + if (error == 0) + error = EFSCORRUPTED; + + spin_lock(&sbi->s_error_lock); + sbi->s_add_error_count++; + sbi->s_last_error_code = error; + sbi->s_last_error_line = line; + sbi->s_last_error_ino = ino; + sbi->s_last_error_block = block; + sbi->s_last_error_func = func; + sbi->s_last_error_time = ktime_get_real_seconds(); + if (!sbi->s_first_error_time) { + sbi->s_first_error_code = error; + sbi->s_first_error_line = line; + sbi->s_first_error_ino = ino; + sbi->s_first_error_block = block; + sbi->s_first_error_func = func; + sbi->s_first_error_time = sbi->s_last_error_time; + } + spin_unlock(&sbi->s_error_lock); +} + +/* Deal with the reporting of failure conditions on a filesystem such as + * inconsistencies detected or read IO failures. + * + * On ext2, we can store the error state of the filesystem in the + * superblock. That is not possible on ext4, because we may have other + * write ordering constraints on the superblock which prevent us from + * writing it out straight away; and given that the journal is about to + * be aborted, we can't rely on the current, or future, transactions to + * write out the superblock safely. + * + * We'll just use the jbd2_journal_abort() error code to record an error in + * the journal instead. On recovery, the journal will complain about + * that error until we've noted it down and cleared it. + * + * If force_ro is set, we unconditionally force the filesystem into an + * ABORT|READONLY state, unless the error response on the fs has been set to + * panic in which case we take the easy way out and panic immediately. This is + * used to deal with unrecoverable failures such as journal IO errors or ENOMEM + * at a critical moment in log management. + */ +static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, + __u32 ino, __u64 block, + const char *func, unsigned int line) +{ + journal_t *journal = EXT4_SB(sb)->s_journal; + bool continue_fs = !force_ro && test_opt(sb, ERRORS_CONT); + + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + if (test_opt(sb, WARN_ON_ERROR)) + WARN_ON_ONCE(1); + + if (!continue_fs && !ext4_emergency_ro(sb) && journal) + jbd2_journal_abort(journal, -EIO); + + if (!bdev_read_only(sb->s_bdev)) { + save_error_info(sb, error, ino, block, func, line); + /* + * In case the fs should keep running, we need to writeout + * superblock through the journal. Due to lock ordering + * constraints, it may not be safe to do it right here so we + * defer superblock flushing to a workqueue. We just need to be + * careful when the journal is already shutting down. If we get + * here in that case, just update the sb directly as the last + * transaction won't commit anyway. + */ + if (continue_fs && journal && + !ext4_test_mount_flag(sb, EXT4_MF_JOURNAL_DESTROY)) + schedule_work(&EXT4_SB(sb)->s_sb_upd_work); + else + ext4_commit_super(sb); + } + + /* + * We force ERRORS_RO behavior when system is rebooting. Otherwise we + * could panic during 'reboot -f' as the underlying device got already + * disabled. + */ + if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) { + panic("EXT4-fs (device %s): panic forced after error\n", + sb->s_id); + } + + if (ext4_emergency_ro(sb) || continue_fs) + return; + + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); + /* + * We don't set SB_RDONLY because that requires sb->s_umount + * semaphore and setting it without the proper remount procedure is + * confusing to code such as freeze_super(), leading to deadlocks + * and other problems.
+ */ + set_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags); +} + +static void update_super_work(struct work_struct *work) +{ + struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, + s_sb_upd_work); + journal_t *journal = sbi->s_journal; + handle_t *handle; + + /* + * If the journal is still running, we have to write out superblock + * through the journal to avoid collisions of other journalled sb + * updates. + * + * We use directly jbd2 functions here to avoid recursing back into + * ext4 error handling code during handling of previous errors. + */ + if (!ext4_emergency_state(sbi->s_sb) && + !sb_rdonly(sbi->s_sb) && journal) { + struct buffer_head *sbh = sbi->s_sbh; + bool call_notify_err = false; + + handle = jbd2_journal_start(journal, 1); + if (IS_ERR(handle)) + goto write_directly; + if (jbd2_journal_get_write_access(handle, sbh)) { + jbd2_journal_stop(handle); + goto write_directly; + } + + if (sbi->s_add_error_count > 0) + call_notify_err = true; + + ext4_update_super(sbi->s_sb); + if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { + ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " + "superblock detected"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + + if (jbd2_journal_dirty_metadata(handle, sbh)) { + jbd2_journal_stop(handle); + goto write_directly; + } + jbd2_journal_stop(handle); + + if (call_notify_err) + ext4_notify_error_sysfs(sbi); + + return; + } +write_directly: + /* + * Write through journal failed. Write sb directly to get error info + * out and hope for the best. + */ + ext4_commit_super(sbi->s_sb); + ext4_notify_error_sysfs(sbi); +} + +#define ext4_error_ratelimit(sb) \ + ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state), \ + "EXT4-fs error") + +void __ext4_error(struct super_block *sb, const char *function, + unsigned int line, bool force_ro, int error, __u64 block, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (unlikely(ext4_emergency_state(sb))) + return; + + trace_ext4_error(sb, function, line); + if (ext4_error_ratelimit(sb)) { + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", + sb->s_id, function, line, current->comm, &vaf); + va_end(args); + } + fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED); + + ext4_handle_error(sb, force_ro, error, 0, block, function, line); +} + +void __ext4_error_inode(struct inode *inode, const char *function, + unsigned int line, ext4_fsblk_t block, int error, + const char *fmt, ...) +{ + va_list args; + struct va_format vaf; + + if (unlikely(ext4_emergency_state(inode->i_sb))) + return; + + trace_ext4_error(inode->i_sb, function, line); + if (ext4_error_ratelimit(inode->i_sb)) { + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (block) + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " + "inode #%lu: block %llu: comm %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + block, current->comm, &vaf); + else + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " + "inode #%lu: comm %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, &vaf); + va_end(args); + } + fsnotify_sb_error(inode->i_sb, inode, error ? error : EFSCORRUPTED); + + ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block, + function, line); +} + +void __ext4_error_file(struct file *file, const char *function, + unsigned int line, ext4_fsblk_t block, + const char *fmt, ...) 
+{ + va_list args; + struct va_format vaf; + struct inode *inode = file_inode(file); + char pathname[80], *path; + + if (unlikely(ext4_emergency_state(inode->i_sb))) + return; + + trace_ext4_error(inode->i_sb, function, line); + if (ext4_error_ratelimit(inode->i_sb)) { + path = file_path(file, pathname, sizeof(pathname)); + if (IS_ERR(path)) + path = "(unknown)"; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (block) + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: inode #%lu: " + "block %llu: comm %s: path %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + block, current->comm, path, &vaf); + else + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: inode #%lu: " + "comm %s: path %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, path, &vaf); + va_end(args); + } + fsnotify_sb_error(inode->i_sb, inode, EFSCORRUPTED); + + ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block, + function, line); +} + +const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]) +{ + char *errstr = NULL; + + switch (errno) { + case -EFSCORRUPTED: + errstr = "Corrupt filesystem"; + break; + case -EFSBADCRC: + errstr = "Filesystem failed CRC"; + break; + case -EIO: + errstr = "IO failure"; + break; + case -ENOMEM: + errstr = "Out of memory"; + break; + case -EROFS: + if (!sb || (EXT4_SB(sb)->s_journal && + EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) + errstr = "Journal has aborted"; + else + errstr = "Readonly filesystem"; + break; + default: + /* If the caller passed in an extra buffer for unknown + * errors, textualise them now. Else we just return + * NULL. */ + if (nbuf) { + /* Check for truncated error codes... */ + if (snprintf(nbuf, 16, "error %d", -errno) >= 0) + errstr = nbuf; + } + break; + } + + return errstr; +} + +/* __ext4_std_error decodes expected errors from journaling functions + * automatically and invokes the appropriate error response. */ + +void __ext4_std_error(struct super_block *sb, const char *function, + unsigned int line, int errno) +{ + char nbuf[16]; + const char *errstr; + + if (unlikely(ext4_emergency_state(sb))) + return; + + /* Special case: if the error is EROFS, and we're not already + * inside a transaction, then there's really no point in logging + * an error. */ + if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb)) + return; + + if (ext4_error_ratelimit(sb)) { + errstr = ext4_decode_error(sb, errno, nbuf); + printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", + sb->s_id, function, line, errstr); + } + fsnotify_sb_error(sb, NULL, errno ? errno : EFSCORRUPTED); + + ext4_handle_error(sb, false, -errno, 0, 0, function, line); +} + +void __ext4_msg(struct super_block *sb, + const char *prefix, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (sb) { + atomic_inc(&EXT4_SB(sb)->s_msg_count); + if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), + "EXT4-fs")) + return; + } + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (sb) + printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + else + printk("%sEXT4-fs: %pV\n", prefix, &vaf); + va_end(args); +} + +static int ext4_warning_ratelimit(struct super_block *sb) +{ + atomic_inc(&EXT4_SB(sb)->s_warning_count); + return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), + "EXT4-fs warning"); +} + +void __ext4_warning(struct super_block *sb, const char *function, + unsigned int line, const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + + if (!ext4_warning_ratelimit(sb)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n", + sb->s_id, function, line, &vaf); + va_end(args); +} + +void __ext4_warning_inode(const struct inode *inode, const char *function, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (!ext4_warning_ratelimit(inode->i_sb)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: " + "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, + function, line, inode->i_ino, current->comm, &vaf); + va_end(args); +} + +void __ext4_grp_locked_error(const char *function, unsigned int line, + struct super_block *sb, ext4_group_t grp, + unsigned long ino, ext4_fsblk_t block, + const char *fmt, ...) +__releases(bitlock) +__acquires(bitlock) +{ + struct va_format vaf; + va_list args; + + if (unlikely(ext4_emergency_state(sb))) + return; + + trace_ext4_error(sb, function, line); + if (ext4_error_ratelimit(sb)) { + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", + sb->s_id, function, line, grp); + if (ino) + printk(KERN_CONT "inode %lu: ", ino); + if (block) + printk(KERN_CONT "block %llu:", + (unsigned long long) block); + printk(KERN_CONT "%pV\n", &vaf); + va_end(args); + } + + if (test_opt(sb, ERRORS_CONT)) { + if (test_opt(sb, WARN_ON_ERROR)) + WARN_ON_ONCE(1); + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + if (!bdev_read_only(sb->s_bdev)) { + save_error_info(sb, EFSCORRUPTED, ino, block, function, + line); + schedule_work(&EXT4_SB(sb)->s_sb_upd_work); + } + return; + } + ext4_unlock_group(sb, grp); + ext4_handle_error(sb, false, EFSCORRUPTED, ino, block, function, line); + /* + * We only get here in the ERRORS_RO case; relocking the group + * may be dangerous, but nothing bad will happen since the + * filesystem will have already been marked read-only and the + * journal has been aborted. We return 1 as a hint to callers + * who might want to use the return value from + * ext4_grp_locked_error() to distinguish between the + * ERRORS_CONT and ERRORS_RO case, and perhaps return more + * aggressively from the ext4 function in question, with a + * more appropriate error code.
+ */ + ext4_lock_group(sb, grp); + return; +} + +void ext4_mark_group_bitmap_corrupted(struct super_block *sb, + ext4_group_t group, + unsigned int flags) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); + int ret; + + if (!grp || !gdp) + return; + if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) { + ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, + &grp->bb_state); + if (!ret) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + } + + if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) { + ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, + &grp->bb_state); + if (!ret && gdp) { + int count; + + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + } +} + +void ext4_update_dynamic_rev(struct super_block *sb) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) + return; + + ext4_warning(sb, + "updating to rev %d because of new feature flag, " + "running e2fsck is recommended", + EXT4_DYNAMIC_REV); + + es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO); + es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE); + es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV); + /* leave es->s_feature_*compat flags alone */ + /* es->s_uuid will be set by e2fsck if empty */ + + /* + * The rest of the superblock fields should be zero, and if not it + * means they are likely already in use, so leave them alone. We + * can leave it up to e2fsck to clean up any inconsistencies there. + */ +} + +static inline struct inode *orphan_list_entry(struct list_head *l) +{ + return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; +} + +static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) +{ + struct list_head *l; + + ext4_msg(sb, KERN_ERR, "sb orphan head is %d", + le32_to_cpu(sbi->s_es->s_last_orphan)); + + printk(KERN_ERR "sb_info orphan list:\n"); + list_for_each(l, &sbi->s_orphan) { + struct inode *inode = orphan_list_entry(l); + printk(KERN_ERR " " + "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", + inode->i_sb->s_id, inode->i_ino, inode, + inode->i_mode, inode->i_nlink, + NEXT_ORPHAN(inode)); + } +} + +#ifdef CONFIG_QUOTA +static int ext4_quota_off(struct super_block *sb, int type); + +static inline void ext4_quotas_off(struct super_block *sb, int type) +{ + BUG_ON(type > EXT4_MAXQUOTAS); + + /* Use our quota_off function to clear inode flags etc. */ + for (type--; type >= 0; type--) + ext4_quota_off(sb, type); +} + +/* + * This is a helper function which is used in the mount/remount + * codepaths (which holds s_umount) to fetch the quota file name. 
+ */ +static inline char *get_qf_name(struct super_block *sb, + struct ext4_sb_info *sbi, + int type) +{ + return rcu_dereference_protected(sbi->s_qf_names[type], + lockdep_is_held(&sb->s_umount)); +} +#else +static inline void ext4_quotas_off(struct super_block *sb, int type) +{ +} +#endif + +static int ext4_percpu_param_init(struct ext4_sb_info *sbi) +{ + ext4_fsblk_t block; + int err; + + block = ext4_count_free_clusters(sbi->s_sb); + ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); + err = percpu_counter_init(&sbi->s_freeclusters_counter, block, + GFP_KERNEL); + if (!err) { + unsigned long freei = ext4_count_free_inodes(sbi->s_sb); + sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); + err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, + GFP_KERNEL); + } + if (!err) + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sbi->s_sb), GFP_KERNEL); + if (!err) + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, + GFP_KERNEL); + if (!err) + err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, + GFP_KERNEL); + if (!err) + err = percpu_init_rwsem(&sbi->s_writepages_rwsem); + + if (err) + ext4_msg(sbi->s_sb, KERN_ERR, "insufficient memory"); + + return err; +} + +static void ext4_percpu_param_destroy(struct ext4_sb_info *sbi) +{ + percpu_counter_destroy(&sbi->s_freeclusters_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); + percpu_free_rwsem(&sbi->s_writepages_rwsem); +} + +static void ext4_group_desc_free(struct ext4_sb_info *sbi) +{ + struct buffer_head **group_desc; + int i; + + rcu_read_lock(); + group_desc = rcu_dereference(sbi->s_group_desc); + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(group_desc[i]); + kvfree(group_desc); + rcu_read_unlock(); +} + +static void ext4_flex_groups_free(struct ext4_sb_info *sbi) +{ + struct flex_groups **flex_groups; + int i; + + rcu_read_lock(); + flex_groups = rcu_dereference(sbi->s_flex_groups); + if (flex_groups) { + for (i = 0; i < sbi->s_flex_groups_allocated; i++) + kvfree(flex_groups[i]); + kvfree(flex_groups); + } + rcu_read_unlock(); +} + +static void ext4_put_super(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int aborted = 0; + int err; + + /* + * Unregister sysfs before destroying the jbd2 journal: the sysfs + * path could still access the attr_journal_task attribute, which + * would see sbi->s_journal->j_task as NULL. + * Also unregister sysfs before flushing sbi->s_sb_upd_work: a user + * may read /proc/fs/ext4/xx/mb_groups during umount, and if a + * metadata read fails verification, error work is queued; + * update_super_work would then call start_this_handle and may + * trigger a BUG_ON.
+ */ + ext4_unregister_sysfs(sb); + + if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount")) + ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.", + &sb->s_uuid); + + ext4_unregister_li_request(sb); + ext4_quotas_off(sb, EXT4_MAXQUOTAS); + + destroy_workqueue(sbi->rsv_conversion_wq); + ext4_release_orphan_info(sb); + + if (sbi->s_journal) { + aborted = is_journal_aborted(sbi->s_journal); + err = ext4_journal_destroy(sbi, sbi->s_journal); + if ((err < 0) && !aborted) { + ext4_abort(sb, -err, "Couldn't clean up the journal"); + } + } else + flush_work(&sbi->s_sb_upd_work); + + ext4_es_unregister_shrinker(sbi); + timer_shutdown_sync(&sbi->s_err_report); + ext4_release_system_zone(sb); + ext4_mb_release(sb); + ext4_ext_release(sb); + + if (!ext4_emergency_state(sb) && !sb_rdonly(sb)) { + if (!aborted) { + ext4_clear_feature_journal_needs_recovery(sb); + ext4_clear_feature_orphan_present(sb); + es->s_state = cpu_to_le16(sbi->s_mount_state); + } + ext4_commit_super(sb); + } + + ext4_group_desc_free(sbi); + ext4_flex_groups_free(sbi); + + WARN_ON_ONCE(!(sbi->s_mount_state & EXT4_ERROR_FS) && + percpu_counter_sum(&sbi->s_dirtyclusters_counter)); + ext4_percpu_param_destroy(sbi); +#ifdef CONFIG_QUOTA + for (int i = 0; i < EXT4_MAXQUOTAS; i++) + kfree(get_qf_name(sb, sbi, i)); +#endif + + /* Debugging code just in case the in-memory inode orphan list + * isn't empty. The on-disk one can be non-empty if we've + * detected an error and taken the fs readonly, but the + * in-memory list had better be clean by this point. */ + if (!list_empty(&sbi->s_orphan)) + dump_orphan_list(sb, sbi); + ASSERT(list_empty(&sbi->s_orphan)); + + sync_blockdev(sb->s_bdev); + invalidate_bdev(sb->s_bdev); + if (sbi->s_journal_bdev_file) { + /* + * Invalidate the journal device's buffers. We don't want them + * floating about in memory - the physical journal device may be + * hotswapped, and it breaks the `ro-after' testing code. + */ + sync_blockdev(file_bdev(sbi->s_journal_bdev_file)); + invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); + } + + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); + sbi->s_ea_inode_cache = NULL; + + ext4_xattr_destroy_cache(sbi->s_ea_block_cache); + sbi->s_ea_block_cache = NULL; + + ext4_stop_mmpd(sbi); + + brelse(sbi->s_sbh); + sb->s_fs_info = NULL; + /* + * Now that we are completely done shutting down the + * superblock, we need to actually destroy the kobject.
+ */ + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + kfree(sbi->s_blockgroup_lock); + fs_put_dax(sbi->s_daxdev, NULL); + fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); +#if IS_ENABLED(CONFIG_UNICODE) + utf8_unload(sb->s_encoding); +#endif + kfree(sbi); +} + +static struct kmem_cache *ext4_inode_cachep; + +/* + * Called inside transaction, so use GFP_NOFS + */ +static struct inode *ext4_alloc_inode(struct super_block *sb) +{ + struct ext4_inode_info *ei; + + ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + + inode_set_iversion(&ei->vfs_inode, 1); + ei->i_flags = 0; + spin_lock_init(&ei->i_raw_lock); + ei->i_prealloc_node = RB_ROOT; + atomic_set(&ei->i_prealloc_active, 0); + rwlock_init(&ei->i_prealloc_lock); + ext4_es_init_tree(&ei->i_es_tree); + rwlock_init(&ei->i_es_lock); + INIT_LIST_HEAD(&ei->i_es_list); + ei->i_es_all_nr = 0; + ei->i_es_shk_nr = 0; + ei->i_es_shrink_lblk = 0; + ei->i_reserved_data_blocks = 0; + spin_lock_init(&(ei->i_block_reservation_lock)); + ext4_init_pending_tree(&ei->i_pending_tree); +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; + memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); +#endif + ei->jinode = NULL; + INIT_LIST_HEAD(&ei->i_rsv_conversion_list); + spin_lock_init(&ei->i_completed_io_lock); + ei->i_sync_tid = 0; + ei->i_datasync_tid = 0; + INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); + ext4_fc_init_inode(&ei->vfs_inode); + spin_lock_init(&ei->i_fc_lock); + return &ei->vfs_inode; +} + +static int ext4_drop_inode(struct inode *inode) +{ + int drop = inode_generic_drop(inode); + + if (!drop) + drop = fscrypt_drop_inode(inode); + + trace_ext4_drop_inode(inode, drop); + return drop; +} + +static void ext4_free_in_core_inode(struct inode *inode) +{ + fscrypt_free_inode(inode); + if (!list_empty(&(EXT4_I(inode)->i_fc_list))) { + pr_warn("%s: inode %ld still in fc list", + __func__, inode->i_ino); + } + kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); +} + +static void ext4_destroy_inode(struct inode *inode) +{ + if (ext4_inode_orphan_tracked(inode)) { + ext4_msg(inode->i_sb, KERN_ERR, + "Inode %lu (%p): inode tracked as orphan!", + inode->i_ino, EXT4_I(inode)); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, + EXT4_I(inode), sizeof(struct ext4_inode_info), + true); + dump_stack(); + } + + if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS) && + WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks)) + ext4_msg(inode->i_sb, KERN_ERR, + "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!", + inode->i_ino, EXT4_I(inode), + EXT4_I(inode)->i_reserved_data_blocks); +} + +static void ext4_shutdown(struct super_block *sb) +{ + ext4_force_shutdown(sb, EXT4_GOING_FLAGS_NOLOGFLUSH); +} + +static void init_once(void *foo) +{ + struct ext4_inode_info *ei = foo; + + INIT_LIST_HEAD(&ei->i_orphan); + init_rwsem(&ei->xattr_sem); + init_rwsem(&ei->i_data_sem); + inode_init_once(&ei->vfs_inode); + ext4_fc_init_inode(&ei->vfs_inode); +#ifdef CONFIG_FS_ENCRYPTION + ei->i_crypt_info = NULL; +#endif +#ifdef CONFIG_FS_VERITY + ei->i_verity_info = NULL; +#endif +} + +static int __init init_inodecache(void) +{ + ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache", + sizeof(struct ext4_inode_info), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, + offsetof(struct ext4_inode_info, i_data), + sizeof_field(struct ext4_inode_info, i_data), + init_once); + if (ext4_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure 
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(ext4_inode_cachep);
+}
+
+void ext4_clear_inode(struct inode *inode)
+{
+	ext4_fc_del(inode);
+	invalidate_inode_buffers(inode);
+	clear_inode(inode);
+	ext4_discard_preallocations(inode);
+	ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+	dquot_drop(inode);
+	if (EXT4_I(inode)->jinode) {
+		jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
+					       EXT4_I(inode)->jinode);
+		jbd2_free_inode(EXT4_I(inode)->jinode);
+		EXT4_I(inode)->jinode = NULL;
+	}
+	fscrypt_put_encryption_info(inode);
+	fsverity_cleanup_inode(inode);
+}
+
+static struct inode *ext4_nfs_get_inode(struct super_block *sb,
+					u64 ino, u32 generation)
+{
+	struct inode *inode;
+
+	/*
+	 * Currently we don't know the generation for parent directory, so
+	 * a generation of 0 means "accept any"
+	 */
+	inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (generation && inode->i_generation != generation) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+
+	return inode;
+}
+
+static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
+					int fh_len, int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+				    ext4_nfs_get_inode);
+}
+
+static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
+					int fh_len, int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+				    ext4_nfs_get_inode);
+}
+
+static int ext4_nfs_commit_metadata(struct inode *inode)
+{
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL
+	};
+
+	trace_ext4_nfs_commit_metadata(inode);
+	return ext4_write_inode(inode, &wbc);
+}
+
+#ifdef CONFIG_QUOTA
+static const char * const quotatypes[] = INITQFNAMES;
+#define QTYPE2NAME(t) (quotatypes[t])
+
+static int ext4_write_dquot(struct dquot *dquot);
+static int ext4_acquire_dquot(struct dquot *dquot);
+static int ext4_release_dquot(struct dquot *dquot);
+static int ext4_mark_dquot_dirty(struct dquot *dquot);
+static int ext4_write_info(struct super_block *sb, int type);
+static int ext4_quota_on(struct super_block *sb, int type, int format_id,
+			 const struct path *path);
+static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
+			       size_t len, loff_t off);
+static ssize_t ext4_quota_write(struct super_block *sb, int type,
+				const char *data, size_t len, loff_t off);
+static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
+			     unsigned int flags);
+
+static struct dquot __rcu **ext4_get_dquots(struct inode *inode)
+{
+	return EXT4_I(inode)->i_dquot;
+}
+
+static const struct dquot_operations ext4_quota_operations = {
+	.get_reserved_space	= ext4_get_reserved_space,
+	.write_dquot		= ext4_write_dquot,
+	.acquire_dquot		= ext4_acquire_dquot,
+	.release_dquot		= ext4_release_dquot,
+	.mark_dirty		= ext4_mark_dquot_dirty,
+	.write_info		= ext4_write_info,
+	.alloc_dquot		= dquot_alloc,
+	.destroy_dquot		= dquot_destroy,
+	.get_projid		= ext4_get_projid,
+	.get_inode_usage	= ext4_get_inode_usage,
+	.get_next_id		= dquot_get_next_id,
+};
+
+static const struct quotactl_ops ext4_qctl_operations = {
+	.quota_on	= ext4_quota_on,
+	.quota_off	= ext4_quota_off,
+	.quota_sync	= dquot_quota_sync,
+	.get_state	= dquot_get_state,
+	.set_info	= dquot_set_dqinfo,
+	.get_dqblk	= dquot_get_dqblk,
+	.set_dqblk	= dquot_set_dqblk,
+	.get_nextdqblk	= dquot_get_next_dqblk,
+};
+#endif
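+/*
+ * Illustration (not in the upstream file): an NFS file handle for ext4
+ * encodes (inode number, generation). If a client presents a handle
+ * after the inode was deleted and its number reused, i_generation no
+ * longer matches and ext4_nfs_get_inode() returns -ESTALE, e.g.:
+ *
+ *	// hypothetical decoded handle
+ *	inode = ext4_nfs_get_inode(sb, 1234, 5);
+ *	// ERR_PTR(-ESTALE) if inode 1234 was freed and re-created
+ *	// (its generation is now 6)
+ */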
+static const struct super_operations ext4_sops = {
+	.alloc_inode	= ext4_alloc_inode,
+	.free_inode	= ext4_free_in_core_inode,
+	.destroy_inode	= ext4_destroy_inode,
+	.write_inode	= ext4_write_inode,
+	.dirty_inode	= ext4_dirty_inode,
+	.drop_inode	= ext4_drop_inode,
+	.evict_inode	= ext4_evict_inode,
+	.put_super	= ext4_put_super,
+	.sync_fs	= ext4_sync_fs,
+	.freeze_fs	= ext4_freeze,
+	.unfreeze_fs	= ext4_unfreeze,
+	.statfs		= ext4_statfs,
+	.show_options	= ext4_show_options,
+	.shutdown	= ext4_shutdown,
+#ifdef CONFIG_QUOTA
+	.quota_read	= ext4_quota_read,
+	.quota_write	= ext4_quota_write,
+	.get_dquots	= ext4_get_dquots,
+#endif
+};
+
+static const struct export_operations ext4_export_ops = {
+	.encode_fh = generic_encode_ino32_fh,
+	.fh_to_dentry = ext4_fh_to_dentry,
+	.fh_to_parent = ext4_fh_to_parent,
+	.get_parent = ext4_get_parent,
+	.commit_metadata = ext4_nfs_commit_metadata,
+};
+
+enum {
+	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
+	Opt_resgid, Opt_resuid, Opt_sb,
+	Opt_nouid32, Opt_debug, Opt_removed,
+	Opt_user_xattr, Opt_acl,
+	Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
+	Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
+	Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
+	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+	Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
+	Opt_inlinecrypt,
+	Opt_usrjquota, Opt_grpjquota, Opt_quota,
+	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
+	Opt_usrquota, Opt_grpquota, Opt_prjquota,
+	Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
+	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
+	Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize,
+	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
+	Opt_inode_readahead_blks, Opt_journal_ioprio,
+	Opt_dioread_nolock, Opt_dioread_lock,
+	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
+	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
+	Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
+	Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type,
+#ifdef CONFIG_EXT4_DEBUG
+	Opt_fc_debug_max_replay, Opt_fc_debug_force
+#endif
+};
+
+static const struct constant_table ext4_param_errors[] = {
+	{"continue",	EXT4_MOUNT_ERRORS_CONT},
+	{"panic",	EXT4_MOUNT_ERRORS_PANIC},
+	{"remount-ro",	EXT4_MOUNT_ERRORS_RO},
+	{}
+};
+
+static const struct constant_table ext4_param_data[] = {
+	{"journal",	EXT4_MOUNT_JOURNAL_DATA},
+	{"ordered",	EXT4_MOUNT_ORDERED_DATA},
+	{"writeback",	EXT4_MOUNT_WRITEBACK_DATA},
+	{}
+};
+
+static const struct constant_table ext4_param_data_err[] = {
+	{"abort",	Opt_data_err_abort},
+	{"ignore",	Opt_data_err_ignore},
+	{}
+};
+
+static const struct constant_table ext4_param_jqfmt[] = {
+	{"vfsold",	QFMT_VFS_OLD},
+	{"vfsv0",	QFMT_VFS_V0},
+	{"vfsv1",	QFMT_VFS_V1},
+	{}
+};
+
+static const struct constant_table ext4_param_dax[] = {
+	{"always",	Opt_dax_always},
+	{"inode",	Opt_dax_inode},
+	{"never",	Opt_dax_never},
+	{}
+};
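+/*
+ * Worked example (illustrative, not from the upstream file): for the
+ * mount string "errors=panic", fs_parse() finds the "errors" entry in
+ * the parameter table below, sees it is an enum, and resolves the
+ * value against ext4_param_errors[], so the Opt_errors handler
+ * receives result.uint_32 == EXT4_MOUNT_ERRORS_PANIC.
+ */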
+/*
+ * Mount option specification
+ * We don't use fsparam_flag_no because of the way we set the
+ * options and the way we show them in _ext4_show_options(). To
+ * keep the changes to a minimum, let's keep the negative options
+ * separate for now.
+ */
+static const struct fs_parameter_spec ext4_param_specs[] = {
+	fsparam_flag	("bsddf",		Opt_bsd_df),
+	fsparam_flag	("minixdf",		Opt_minix_df),
+	fsparam_flag	("grpid",		Opt_grpid),
+	fsparam_flag	("bsdgroups",		Opt_grpid),
+	fsparam_flag	("nogrpid",		Opt_nogrpid),
+	fsparam_flag	("sysvgroups",		Opt_nogrpid),
+	fsparam_gid	("resgid",		Opt_resgid),
+	fsparam_uid	("resuid",		Opt_resuid),
+	fsparam_u32	("sb",			Opt_sb),
+	fsparam_enum	("errors",		Opt_errors, ext4_param_errors),
+	fsparam_flag	("nouid32",		Opt_nouid32),
+	fsparam_flag	("debug",		Opt_debug),
+	fsparam_flag	("oldalloc",		Opt_removed),
+	fsparam_flag	("orlov",		Opt_removed),
+	fsparam_flag	("user_xattr",		Opt_user_xattr),
+	fsparam_flag	("acl",			Opt_acl),
+	fsparam_flag	("norecovery",		Opt_noload),
+	fsparam_flag	("noload",		Opt_noload),
+	fsparam_flag	("bh",			Opt_removed),
+	fsparam_flag	("nobh",		Opt_removed),
+	fsparam_u32	("commit",		Opt_commit),
+	fsparam_u32	("min_batch_time",	Opt_min_batch_time),
+	fsparam_u32	("max_batch_time",	Opt_max_batch_time),
+	fsparam_u32	("journal_dev",		Opt_journal_dev),
+	fsparam_bdev	("journal_path",	Opt_journal_path),
+	fsparam_flag	("journal_checksum",	Opt_journal_checksum),
+	fsparam_flag	("nojournal_checksum",	Opt_nojournal_checksum),
+	fsparam_flag	("journal_async_commit",Opt_journal_async_commit),
+	fsparam_flag	("abort",		Opt_abort),
+	fsparam_enum	("data",		Opt_data, ext4_param_data),
+	fsparam_enum	("data_err",		Opt_data_err,
+						ext4_param_data_err),
+	fsparam_string_empty
+			("usrjquota",		Opt_usrjquota),
+	fsparam_string_empty
+			("grpjquota",		Opt_grpjquota),
+	fsparam_enum	("jqfmt",		Opt_jqfmt, ext4_param_jqfmt),
+	fsparam_flag	("grpquota",		Opt_grpquota),
+	fsparam_flag	("quota",		Opt_quota),
+	fsparam_flag	("noquota",		Opt_noquota),
+	fsparam_flag	("usrquota",		Opt_usrquota),
+	fsparam_flag	("prjquota",		Opt_prjquota),
+	fsparam_flag	("barrier",		Opt_barrier),
+	fsparam_u32	("barrier",		Opt_barrier),
+	fsparam_flag	("nobarrier",		Opt_nobarrier),
+	fsparam_flag	("i_version",		Opt_removed),
+	fsparam_flag	("dax",			Opt_dax),
+	fsparam_enum	("dax",			Opt_dax_type, ext4_param_dax),
+	fsparam_u32	("stripe",		Opt_stripe),
+	fsparam_flag	("delalloc",		Opt_delalloc),
+	fsparam_flag	("nodelalloc",		Opt_nodelalloc),
+	fsparam_flag	("warn_on_error",	Opt_warn_on_error),
+	fsparam_flag	("nowarn_on_error",	Opt_nowarn_on_error),
+	fsparam_u32	("debug_want_extra_isize",
+						Opt_debug_want_extra_isize),
+	fsparam_flag	("mblk_io_submit",	Opt_removed),
+	fsparam_flag	("nomblk_io_submit",	Opt_removed),
+	fsparam_flag	("block_validity",	Opt_block_validity),
+	fsparam_flag	("noblock_validity",	Opt_noblock_validity),
+	fsparam_u32	("inode_readahead_blks",
+						Opt_inode_readahead_blks),
+	fsparam_u32	("journal_ioprio",	Opt_journal_ioprio),
+	fsparam_u32	("auto_da_alloc",	Opt_auto_da_alloc),
+	fsparam_flag	("auto_da_alloc",	Opt_auto_da_alloc),
+	fsparam_flag	("noauto_da_alloc",	Opt_noauto_da_alloc),
+	fsparam_flag	("dioread_nolock",	Opt_dioread_nolock),
+	fsparam_flag	("nodioread_nolock",	Opt_dioread_lock),
+	fsparam_flag	("dioread_lock",	Opt_dioread_lock),
+	fsparam_flag	("discard",		Opt_discard),
+	fsparam_flag	("nodiscard",		Opt_nodiscard),
+	fsparam_u32	("init_itable",		Opt_init_itable),
+	fsparam_flag	("init_itable",		Opt_init_itable),
+	fsparam_flag	("noinit_itable",	Opt_noinit_itable),
+#ifdef CONFIG_EXT4_DEBUG
+	fsparam_flag	("fc_debug_force",	Opt_fc_debug_force),
+	fsparam_u32	("fc_debug_max_replay",	Opt_fc_debug_max_replay),
+#endif
+	fsparam_u32	("max_dir_size_kb",	Opt_max_dir_size_kb),
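+	/*
+	 * Note (illustrative, not from the upstream file): several keys
+	 * appear twice with different types ("barrier", "dax",
+	 * "auto_da_alloc", "init_itable", "test_dummy_encryption"), so
+	 * both the bare form ("barrier") and the valued form
+	 * ("barrier=1") parse; fs_parse() matches the entry whose type
+	 * fits the way the option was supplied.
+	 */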
+	fsparam_flag	("test_dummy_encryption",
+						Opt_test_dummy_encryption),
+	fsparam_string	("test_dummy_encryption",
+						Opt_test_dummy_encryption),
+	fsparam_flag	("inlinecrypt",		Opt_inlinecrypt),
+	fsparam_flag	("nombcache",		Opt_nombcache),
+	fsparam_flag	("no_mbcache",		Opt_nombcache),	/* for backward compatibility */
+	fsparam_flag	("prefetch_block_bitmaps",
+						Opt_removed),
+	fsparam_flag	("no_prefetch_block_bitmaps",
+						Opt_no_prefetch_block_bitmaps),
+	fsparam_s32	("mb_optimize_scan",	Opt_mb_optimize_scan),
+	fsparam_string	("check",		Opt_removed),	/* mount option from ext2/3 */
+	fsparam_flag	("nocheck",		Opt_removed),	/* mount option from ext2/3 */
+	fsparam_flag	("reservation",		Opt_removed),	/* mount option from ext2/3 */
+	fsparam_flag	("noreservation",	Opt_removed),	/* mount option from ext2/3 */
+	fsparam_u32	("journal",		Opt_removed),	/* mount option from ext2/3 */
+	{}
+};
+
+
+#define MOPT_SET	0x0001
+#define MOPT_CLEAR	0x0002
+#define MOPT_NOSUPPORT	0x0004
+#define MOPT_EXPLICIT	0x0008
+#ifdef CONFIG_QUOTA
+#define MOPT_Q		0
+#define MOPT_QFMT	0x0010
+#else
+#define MOPT_Q		MOPT_NOSUPPORT
+#define MOPT_QFMT	MOPT_NOSUPPORT
+#endif
+#define MOPT_NO_EXT2	0x0020
+#define MOPT_NO_EXT3	0x0040
+#define MOPT_EXT4_ONLY	(MOPT_NO_EXT2 | MOPT_NO_EXT3)
+#define MOPT_SKIP	0x0080
+#define MOPT_2		0x0100
+
+static const struct mount_opts {
+	int	token;
+	int	mount_opt;
+	int	flags;
+} ext4_mount_opts[] = {
+	{Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
+	{Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
+	{Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
+	{Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
+	{Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
+	{Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
+	{Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK,
+	 MOPT_EXT4_ONLY | MOPT_SET},
+	{Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK,
+	 MOPT_EXT4_ONLY | MOPT_CLEAR},
+	{Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
+	{Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
+	{Opt_delalloc, EXT4_MOUNT_DELALLOC,
+	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
+	{Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
+	 MOPT_EXT4_ONLY | MOPT_CLEAR},
+	{Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET},
+	{Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR},
+	{Opt_commit, 0, MOPT_NO_EXT2},
+	{Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
+	 MOPT_EXT4_ONLY | MOPT_CLEAR},
+	{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
+	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
+	{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
+				    EXT4_MOUNT_JOURNAL_CHECKSUM),
+	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
+	{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
+	{Opt_data_err, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_NO_EXT2},
+	{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
+	{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
+	{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
+	{Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
+	{Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
+	{Opt_dax_type, 0, MOPT_EXT4_ONLY},
+	{Opt_journal_dev, 0, MOPT_NO_EXT2},
+	{Opt_journal_path, 0, MOPT_NO_EXT2},
+	{Opt_journal_ioprio, 0, MOPT_NO_EXT2},
+	{Opt_data, 0, MOPT_NO_EXT2},
+	{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+	{Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
+#else
+	{Opt_acl, 0, MOPT_NOSUPPORT},
+#endif
+	{Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
+	{Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
+	{Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
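+	/*
+	 * Illustration (not from the upstream file): each entry pairs a
+	 * token with the s_mount_opt bit(s) it controls. "discard" hits
+	 * {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET} and sets the bit,
+	 * while "nodiscard" uses MOPT_CLEAR to clear the same bit;
+	 * MOPT_2 entries operate on s_mount_opt2 instead.
+	 */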
+	{Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
+	 MOPT_SET | MOPT_Q},
+	{Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
+	 MOPT_SET | MOPT_Q},
+	{Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA,
+	 MOPT_SET | MOPT_Q},
+	{Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
+		       EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA),
+	 MOPT_CLEAR | MOPT_Q},
+	{Opt_usrjquota, 0, MOPT_Q},
+	{Opt_grpjquota, 0, MOPT_Q},
+	{Opt_jqfmt, 0, MOPT_QFMT},
+	{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
+	{Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS,
+	 MOPT_SET},
+#ifdef CONFIG_EXT4_DEBUG
+	{Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
+	 MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
+#endif
+	{Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2},
+	{Opt_err, 0, 0}
+};
+
+#if IS_ENABLED(CONFIG_UNICODE)
+static const struct ext4_sb_encodings {
+	__u16 magic;
+	char *name;
+	unsigned int version;
+} ext4_sb_encoding_map[] = {
+	{EXT4_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)},
+};
+
+static const struct ext4_sb_encodings *
+ext4_sb_read_encoding(const struct ext4_super_block *es)
+{
+	__u16 magic = le16_to_cpu(es->s_encoding);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++)
+		if (magic == ext4_sb_encoding_map[i].magic)
+			return &ext4_sb_encoding_map[i];
+
+	return NULL;
+}
+#endif
+
+#define EXT4_SPEC_JQUOTA			(1 << 0)
+#define EXT4_SPEC_JQFMT				(1 << 1)
+#define EXT4_SPEC_DATAJ				(1 << 2)
+#define EXT4_SPEC_SB_BLOCK			(1 << 3)
+#define EXT4_SPEC_JOURNAL_DEV			(1 << 4)
+#define EXT4_SPEC_JOURNAL_IOPRIO		(1 << 5)
+#define EXT4_SPEC_s_want_extra_isize		(1 << 7)
+#define EXT4_SPEC_s_max_batch_time		(1 << 8)
+#define EXT4_SPEC_s_min_batch_time		(1 << 9)
+#define EXT4_SPEC_s_inode_readahead_blks	(1 << 10)
+#define EXT4_SPEC_s_li_wait_mult		(1 << 11)
+#define EXT4_SPEC_s_max_dir_size_kb		(1 << 12)
+#define EXT4_SPEC_s_stripe			(1 << 13)
+#define EXT4_SPEC_s_resuid			(1 << 14)
+#define EXT4_SPEC_s_resgid			(1 << 15)
+#define EXT4_SPEC_s_commit_interval		(1 << 16)
+#define EXT4_SPEC_s_fc_debug_max_replay		(1 << 17)
+#define EXT4_SPEC_s_sb_block			(1 << 18)
+#define EXT4_SPEC_mb_optimize_scan		(1 << 19)
+
+struct ext4_fs_context {
+	char		*s_qf_names[EXT4_MAXQUOTAS];
+	struct fscrypt_dummy_policy dummy_enc_policy;
+	int		s_jquota_fmt;	/* Format of quota to use */
+#ifdef CONFIG_EXT4_DEBUG
+	int s_fc_debug_max_replay;
+#endif
+	unsigned short	qname_spec;
+	unsigned long	vals_s_flags;	/* Bits to set in s_flags */
+	unsigned long	mask_s_flags;	/* Bits changed in s_flags */
+	unsigned long	journal_devnum;
+	unsigned long	s_commit_interval;
+	unsigned long	s_stripe;
+	unsigned int	s_inode_readahead_blks;
+	unsigned int	s_want_extra_isize;
+	unsigned int	s_li_wait_mult;
+	unsigned int	s_max_dir_size_kb;
+	unsigned int	journal_ioprio;
+	unsigned int	vals_s_mount_opt;
+	unsigned int	mask_s_mount_opt;
+	unsigned int	vals_s_mount_opt2;
+	unsigned int	mask_s_mount_opt2;
+	unsigned int	opt_flags;	/* MOPT flags */
+	unsigned int	spec;
+	u32		s_max_batch_time;
+	u32		s_min_batch_time;
+	kuid_t		s_resuid;
+	kgid_t		s_resgid;
+	ext4_fsblk_t	s_sb_block;
+};
+
+static void ext4_fc_free(struct fs_context *fc)
+{
+	struct ext4_fs_context *ctx = fc->fs_private;
+	int i;
+
+	if (!ctx)
+		return;
+
+	for (i = 0; i < EXT4_MAXQUOTAS; i++)
+		kfree(ctx->s_qf_names[i]);
+
+	fscrypt_free_dummy_policy(&ctx->dummy_enc_policy);
+	kfree(ctx);
+}
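+/*
+ * Illustration (not from the upstream file): a new-style mount builds
+ * an ext4_fs_context first and only touches the superblock at the end,
+ * roughly:
+ *
+ *	fsconfig(fd, FSCONFIG_SET_STRING, "errors", "remount-ro", 0);
+ *		// -> ext4_parse_param() records the bits in ctx
+ *	fsconfig(fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
+ *		// -> ext4_apply_options() copies ctx into the sbi
+ *
+ * The EXT4_SPEC_* bits in ctx->spec track which fields were actually
+ * specified, so unmentioned fields keep their current values.
+ */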
+int ext4_init_fs_context(struct fs_context *fc)
+{
+	struct ext4_fs_context *ctx;
+
+	ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	fc->fs_private = ctx;
+	fc->ops = &ext4_context_ops;
+
+	/* i_version is always enabled now */
+	fc->sb_flags |= SB_I_VERSION;
+
+	return 0;
+}
+
+#ifdef CONFIG_QUOTA
+/*
+ * Note the name of the specified quota file.
+ */
+static int note_qf_name(struct fs_context *fc, int qtype,
+			struct fs_parameter *param)
+{
+	struct ext4_fs_context *ctx = fc->fs_private;
+	char *qname;
+
+	if (param->size < 1) {
+		ext4_msg(NULL, KERN_ERR, "Missing quota name");
+		return -EINVAL;
+	}
+	if (strchr(param->string, '/')) {
+		ext4_msg(NULL, KERN_ERR,
+			 "quotafile must be on filesystem root");
+		return -EINVAL;
+	}
+	if (ctx->s_qf_names[qtype]) {
+		if (strcmp(ctx->s_qf_names[qtype], param->string) != 0) {
+			ext4_msg(NULL, KERN_ERR,
+				 "%s quota file already specified",
+				 QTYPE2NAME(qtype));
+			return -EINVAL;
+		}
+		return 0;
+	}
+
+	qname = kmemdup_nul(param->string, param->size, GFP_KERNEL);
+	if (!qname) {
+		ext4_msg(NULL, KERN_ERR,
+			 "Not enough memory for storing quotafile name");
+		return -ENOMEM;
+	}
+	ctx->s_qf_names[qtype] = qname;
+	ctx->qname_spec |= 1 << qtype;
+	ctx->spec |= EXT4_SPEC_JQUOTA;
+	return 0;
+}
+
+/*
+ * Clear the name of the specified quota file.
+ */
+static int unnote_qf_name(struct fs_context *fc, int qtype)
+{
+	struct ext4_fs_context *ctx = fc->fs_private;
+
+	kfree(ctx->s_qf_names[qtype]);
+
+	ctx->s_qf_names[qtype] = NULL;
+	ctx->qname_spec |= 1 << qtype;
+	ctx->spec |= EXT4_SPEC_JQUOTA;
+	return 0;
+}
+#endif
+
+static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param,
+					    struct ext4_fs_context *ctx)
+{
+	int err;
+
+	if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) {
+		ext4_msg(NULL, KERN_WARNING,
+			 "test_dummy_encryption option not supported");
+		return -EINVAL;
+	}
+	err = fscrypt_parse_test_dummy_encryption(param,
+						  &ctx->dummy_enc_policy);
+	if (err == -EINVAL) {
+		ext4_msg(NULL, KERN_WARNING,
+			 "Value of option \"%s\" is unrecognized", param->key);
+	} else if (err == -EEXIST) {
+		ext4_msg(NULL, KERN_WARNING,
+			 "Conflicting test_dummy_encryption options");
+		return -EINVAL;
+	}
+	return err;
+}
+
+#define EXT4_SET_CTX(name)						\
+static inline __maybe_unused						\
+void ctx_set_##name(struct ext4_fs_context *ctx, unsigned long flag)	\
+{									\
+	ctx->mask_s_##name |= flag;					\
+	ctx->vals_s_##name |= flag;					\
+}
+
+#define EXT4_CLEAR_CTX(name)						\
+static inline __maybe_unused						\
+void ctx_clear_##name(struct ext4_fs_context *ctx, unsigned long flag)	\
+{									\
+	ctx->mask_s_##name |= flag;					\
+	ctx->vals_s_##name &= ~flag;					\
+}
+
+#define EXT4_TEST_CTX(name)						\
+static inline unsigned long						\
+ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag)	\
+{									\
+	return (ctx->vals_s_##name & flag);				\
+}
+
+EXT4_SET_CTX(flags); /* set only */
+EXT4_SET_CTX(mount_opt);
+EXT4_CLEAR_CTX(mount_opt);
+EXT4_TEST_CTX(mount_opt);
+EXT4_SET_CTX(mount_opt2);
+EXT4_CLEAR_CTX(mount_opt2);
+EXT4_TEST_CTX(mount_opt2);
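+/*
+ * Expansion example (illustrative, not from the upstream file):
+ * EXT4_SET_CTX(mount_opt) generates, modulo __maybe_unused:
+ *
+ *	static inline void ctx_set_mount_opt(struct ext4_fs_context *ctx,
+ *					     unsigned long flag)
+ *	{
+ *		ctx->mask_s_mount_opt |= flag;
+ *		ctx->vals_s_mount_opt |= flag;
+ *	}
+ *
+ * mask_* remembers that a bit was mentioned at all and vals_* records
+ * its value, which lets ext4_apply_options() change exactly the bits
+ * the user named and leave the rest of s_mount_opt untouched.
+ */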
+static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+	struct ext4_fs_context *ctx = fc->fs_private;
+	struct fs_parse_result result;
+	const struct mount_opts *m;
+	int is_remount;
+	int token;
+
+	token = fs_parse(fc, ext4_param_specs, param, &result);
+	if (token < 0)
+		return token;
+	is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;
+
+	for (m = ext4_mount_opts; m->token != Opt_err; m++)
+		if (token == m->token)
+			break;
+
+	ctx->opt_flags |= m->flags;
+
+	if (m->flags & MOPT_EXPLICIT) {
+		if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
+			ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_DELALLOC);
+		} else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
+			ctx_set_mount_opt2(ctx,
+					   EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM);
+		} else
+			return -EINVAL;
+	}
+
+	if (m->flags & MOPT_NOSUPPORT) {
+		ext4_msg(NULL, KERN_ERR, "%s option not supported",
+			 param->key);
+		return 0;
+	}
+
+	switch (token) {
+#ifdef CONFIG_QUOTA
+	case Opt_usrjquota:
+		if (!*param->string)
+			return unnote_qf_name(fc, USRQUOTA);
+		else
+			return note_qf_name(fc, USRQUOTA, param);
+	case Opt_grpjquota:
+		if (!*param->string)
+			return unnote_qf_name(fc, GRPQUOTA);
+		else
+			return note_qf_name(fc, GRPQUOTA, param);
+#endif
+	case Opt_sb:
+		if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+			ext4_msg(NULL, KERN_WARNING,
+				 "Ignoring %s option on remount", param->key);
+		} else {
+			ctx->s_sb_block = result.uint_32;
+			ctx->spec |= EXT4_SPEC_s_sb_block;
+		}
+		return 0;
+	case Opt_removed:
+		ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option",
+			 param->key);
+		return 0;
+	case Opt_inlinecrypt:
+#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
+		ctx_set_flags(ctx, SB_INLINECRYPT);
+#else
+		ext4_msg(NULL, KERN_ERR, "inline encryption not supported");
+#endif
+		return 0;
+	case Opt_errors:
+		ctx_clear_mount_opt(ctx, EXT4_MOUNT_ERRORS_MASK);
+		ctx_set_mount_opt(ctx, result.uint_32);
+		return 0;
+#ifdef CONFIG_QUOTA
+	case Opt_jqfmt:
+		ctx->s_jquota_fmt = result.uint_32;
+		ctx->spec |= EXT4_SPEC_JQFMT;
+		return 0;
+#endif
+	case Opt_data:
+		ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS);
+		ctx_set_mount_opt(ctx, result.uint_32);
+		ctx->spec |= EXT4_SPEC_DATAJ;
+		return 0;
+	case Opt_commit:
+		if (result.uint_32 == 0)
+			result.uint_32 = JBD2_DEFAULT_MAX_COMMIT_AGE;
+		else if (result.uint_32 > INT_MAX / HZ) {
+			ext4_msg(NULL, KERN_ERR,
+				 "Invalid commit interval %d, "
+				 "must be smaller than %d",
+				 result.uint_32, INT_MAX / HZ);
+			return -EINVAL;
+		}
+		ctx->s_commit_interval = HZ * result.uint_32;
+		ctx->spec |= EXT4_SPEC_s_commit_interval;
+		return 0;
+	case Opt_debug_want_extra_isize:
+		if ((result.uint_32 & 1) || (result.uint_32 < 4)) {
+			ext4_msg(NULL, KERN_ERR,
+				 "Invalid want_extra_isize %d", result.uint_32);
+			return -EINVAL;
+		}
+		ctx->s_want_extra_isize = result.uint_32;
+		ctx->spec |= EXT4_SPEC_s_want_extra_isize;
+		return 0;
+	case Opt_max_batch_time:
+		ctx->s_max_batch_time = result.uint_32;
+		ctx->spec |= EXT4_SPEC_s_max_batch_time;
+		return 0;
+	case Opt_min_batch_time:
+		ctx->s_min_batch_time = result.uint_32;
+		ctx->spec |= EXT4_SPEC_s_min_batch_time;
+		return 0;
+	case Opt_inode_readahead_blks:
+		if (result.uint_32 &&
+		    (result.uint_32 > (1 << 30) ||
+		     !is_power_of_2(result.uint_32))) {
+			ext4_msg(NULL, KERN_ERR,
+				 "EXT4-fs: inode_readahead_blks must be "
+				 "0 or a power of 2 smaller than 2^31");
+			return -EINVAL;
+		}
+		ctx->s_inode_readahead_blks = result.uint_32;
+		ctx->spec |= EXT4_SPEC_s_inode_readahead_blks;
+		return 0;
+	case Opt_init_itable:
+		ctx_set_mount_opt(ctx, EXT4_MOUNT_INIT_INODE_TABLE);
+		ctx->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+		if (param->type == fs_value_is_string)
+			ctx->s_li_wait_mult = result.uint_32;
+		ctx->spec |= EXT4_SPEC_s_li_wait_mult;
+		return 0;
+	case Opt_max_dir_size_kb:
+		ctx->s_max_dir_size_kb = result.uint_32;
+		ctx->spec |= EXT4_SPEC_s_max_dir_size_kb;
+		return 0;
+#ifdef CONFIG_EXT4_DEBUG
+	case Opt_fc_debug_max_replay:
+		ctx->s_fc_debug_max_replay = result.uint_32;
+		ctx->spec |= EXT4_SPEC_s_fc_debug_max_replay;
+		return 0;
+#endif
+	case Opt_stripe:
+		ctx->s_stripe = result.uint_32;
+		ctx->spec |= EXT4_SPEC_s_stripe;
+		return 0;
+	case Opt_resuid:
+		ctx->s_resuid = result.uid;
+		ctx->spec |= EXT4_SPEC_s_resuid;
+		return 0;
+	case Opt_resgid:
+		ctx->s_resgid = result.gid;
+		ctx->spec |= EXT4_SPEC_s_resgid;
+		return 0;
+	case Opt_journal_dev:
+		if (is_remount) {
+			ext4_msg(NULL, KERN_ERR,
+				 "Cannot specify journal on remount");
+			return -EINVAL;
+		}
+		ctx->journal_devnum = result.uint_32;
+		ctx->spec |= EXT4_SPEC_JOURNAL_DEV;
+		return 0;
+	case Opt_journal_path:
+	{
+		struct inode *journal_inode;
+		struct path path;
+		int error;
+
+		if (is_remount) {
+			ext4_msg(NULL, KERN_ERR,
+				 "Cannot specify journal on remount");
+			return -EINVAL;
+		}
+
+		error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path);
+		if (error) {
+			ext4_msg(NULL, KERN_ERR, "error: could not find "
+				 "journal device path");
+			return -EINVAL;
+		}
+
+		journal_inode = d_inode(path.dentry);
+		ctx->journal_devnum = new_encode_dev(journal_inode->i_rdev);
+		ctx->spec |= EXT4_SPEC_JOURNAL_DEV;
+		path_put(&path);
+		return 0;
+	}
+	case Opt_journal_ioprio:
+		if (result.uint_32 > 7) {
+			ext4_msg(NULL, KERN_ERR, "Invalid journal IO priority"
+				 " (must be 0-7)");
+			return -EINVAL;
+		}
+		ctx->journal_ioprio =
+			IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, result.uint_32);
+		ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO;
+		return 0;
+	case Opt_test_dummy_encryption:
+		return ext4_parse_test_dummy_encryption(param, ctx);
+	case Opt_dax:
+	case Opt_dax_type:
+#ifdef CONFIG_FS_DAX
+	{
+		int type = (token == Opt_dax) ?
+			   Opt_dax : result.uint_32;
+
+		switch (type) {
+		case Opt_dax:
+		case Opt_dax_always:
+			ctx_set_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
+			ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
+			break;
+		case Opt_dax_never:
+			ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
+			ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
+			break;
+		case Opt_dax_inode:
+			ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
+			ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
+			/* Strictly for printing options */
+			ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE);
+			break;
+		}
+		return 0;
+	}
+#else
+		ext4_msg(NULL, KERN_INFO, "dax option not supported");
+		return -EINVAL;
+#endif
+	case Opt_data_err:
+		if (result.uint_32 == Opt_data_err_abort)
+			ctx_set_mount_opt(ctx, m->mount_opt);
+		else if (result.uint_32 == Opt_data_err_ignore)
+			ctx_clear_mount_opt(ctx, m->mount_opt);
+		return 0;
+	case Opt_mb_optimize_scan:
+		if (result.int_32 == 1) {
+			ctx_set_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
+			ctx->spec |= EXT4_SPEC_mb_optimize_scan;
+		} else if (result.int_32 == 0) {
+			ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
+			ctx->spec |= EXT4_SPEC_mb_optimize_scan;
+		} else {
+			ext4_msg(NULL, KERN_WARNING,
+				 "mb_optimize_scan should be set to 0 or 1.");
+			return -EINVAL;
+		}
+		return 0;
+	}
+
+	/*
+	 * At this point we should only be getting options requiring MOPT_SET,
+	 * or MOPT_CLEAR. Anything else is a bug
+	 */
+	if (m->token == Opt_err) {
+		ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s",
+			 param->key);
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	else {
+		unsigned int set = 0;
+
+		if ((param->type == fs_value_is_flag) ||
+		    result.uint_32 > 0)
+			set = 1;
+
+		if (m->flags & MOPT_CLEAR)
+			set = !set;
+		else if (unlikely(!(m->flags & MOPT_SET))) {
+			ext4_msg(NULL, KERN_WARNING,
+				 "buggy handling of option %s",
+				 param->key);
+			WARN_ON(1);
+			return -EINVAL;
+		}
+		if (m->flags & MOPT_2) {
+			if (set != 0)
+				ctx_set_mount_opt2(ctx, m->mount_opt);
+			else
+				ctx_clear_mount_opt2(ctx, m->mount_opt);
+		} else {
+			if (set != 0)
+				ctx_set_mount_opt(ctx, m->mount_opt);
+			else
+				ctx_clear_mount_opt(ctx, m->mount_opt);
+		}
+	}
+
+	return 0;
+}
+
+static int parse_options(struct fs_context *fc, char *options)
+{
+	struct fs_parameter param;
+	int ret;
+	char *key;
+
+	if (!options)
+		return 0;
+
+	while ((key = strsep(&options, ",")) != NULL) {
+		if (*key) {
+			size_t v_len = 0;
+			char *value = strchr(key, '=');
+
+			param.type = fs_value_is_flag;
+			param.string = NULL;
+
+			if (value) {
+				if (value == key)
+					continue;
+
+				*value++ = 0;
+				v_len = strlen(value);
+				param.string = kmemdup_nul(value, v_len,
+							   GFP_KERNEL);
+				if (!param.string)
+					return -ENOMEM;
+				param.type = fs_value_is_string;
+			}
+
+			param.key = key;
+			param.size = v_len;
+
+			ret = ext4_parse_param(fc, &param);
+			kfree(param.string);
+			if (ret < 0)
+				return ret;
+		}
+	}
+
+	ret = ext4_validate_options(fc);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+static int parse_apply_sb_mount_options(struct super_block *sb,
+					struct ext4_fs_context *m_ctx)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	char s_mount_opts[65];
+	struct ext4_fs_context *s_ctx = NULL;
+	struct fs_context *fc = NULL;
+	int ret = -ENOMEM;
+
+	if (!sbi->s_es->s_mount_opts[0])
+		return 0;
+
+	strscpy_pad(s_mount_opts, sbi->s_es->s_mount_opts);
+
+	fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
+	if (!fc)
+		return -ENOMEM;
+
+	s_ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL);
+	if (!s_ctx)
+		goto out_free;
+
+	fc->fs_private = s_ctx;
+	fc->s_fs_info = sbi;
+
+	ret = parse_options(fc, s_mount_opts);
+	if (ret < 0)
+		goto parse_failed;
+
+	ret = ext4_check_opt_consistency(fc, sb);
+	if (ret < 0) {
+parse_failed:
+		ext4_msg(sb, KERN_WARNING,
+			 "failed to parse options in superblock: %s",
+			 s_mount_opts);
+		ret = 0;
+		goto out_free;
+	}
+
+	if (s_ctx->spec & EXT4_SPEC_JOURNAL_DEV)
+		m_ctx->journal_devnum = s_ctx->journal_devnum;
+	if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)
+		m_ctx->journal_ioprio = s_ctx->journal_ioprio;
+
+	ext4_apply_options(fc, sb);
+	ret = 0;
+
+out_free:
+	ext4_fc_free(fc);
+	kfree(fc);
+	return ret;
+}
+
+static void ext4_apply_quota_options(struct fs_context *fc,
+				     struct super_block *sb)
+{
+#ifdef CONFIG_QUOTA
+	bool quota_feature = ext4_has_feature_quota(sb);
+	struct ext4_fs_context *ctx = fc->fs_private;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	char *qname;
+	int i;
+
+	if (quota_feature)
+		return;
+
+	if (ctx->spec & EXT4_SPEC_JQUOTA) {
+		for (i = 0; i < EXT4_MAXQUOTAS; i++) {
+			if (!(ctx->qname_spec & (1 << i)))
+				continue;
+
+			qname = ctx->s_qf_names[i]; /* May be NULL */
+			if (qname)
+				set_opt(sb, QUOTA);
+			ctx->s_qf_names[i] = NULL;
+			qname = rcu_replace_pointer(sbi->s_qf_names[i], qname,
+						    lockdep_is_held(&sb->s_umount));
+			if (qname)
+				kfree_rcu_mightsleep(qname);
+		}
+	}
+
+	if (ctx->spec & EXT4_SPEC_JQFMT)
+		sbi->s_jquota_fmt = ctx->s_jquota_fmt;
+#endif
+}
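+/*
+ * Illustration (not from the upstream file): s_qf_names[] is read under
+ * rcu_read_lock() (see ext4_show_quota_options() below), so the writer
+ * swaps the pointer with rcu_replace_pointer() while holding s_umount
+ * and frees the old name only after a grace period:
+ *
+ *	old = rcu_replace_pointer(sbi->s_qf_names[i], new, ...);
+ *	kfree_rcu_mightsleep(old);	// readers may still hold 'old'
+ */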
+/*
+ * Check quota settings consistency.
+ */
+static int ext4_check_quota_consistency(struct fs_context *fc,
+					struct super_block *sb)
+{
+#ifdef CONFIG_QUOTA
+	struct ext4_fs_context *ctx = fc->fs_private;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	bool quota_feature = ext4_has_feature_quota(sb);
+	bool quota_loaded = sb_any_quota_loaded(sb);
+	bool usr_qf_name, grp_qf_name, usrquota, grpquota;
+	int quota_flags, i;
+
+	/*
+	 * We do the test below only for project quotas. 'usrquota' and
+	 * 'grpquota' mount options are allowed even without quota feature
+	 * to support legacy quotas in quota files.
+	 */
+	if (ctx_test_mount_opt(ctx, EXT4_MOUNT_PRJQUOTA) &&
+	    !ext4_has_feature_project(sb)) {
+		ext4_msg(NULL, KERN_ERR, "Project quota feature not enabled. "
+			 "Cannot enable project quota enforcement.");
+		return -EINVAL;
+	}
+
+	quota_flags = EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
+		      EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA;
+	if (quota_loaded &&
+	    ctx->mask_s_mount_opt & quota_flags &&
+	    !ctx_test_mount_opt(ctx, quota_flags))
+		goto err_quota_change;
+
+	if (ctx->spec & EXT4_SPEC_JQUOTA) {
+
+		for (i = 0; i < EXT4_MAXQUOTAS; i++) {
+			if (!(ctx->qname_spec & (1 << i)))
+				continue;
+
+			if (quota_loaded &&
+			    !!sbi->s_qf_names[i] != !!ctx->s_qf_names[i])
+				goto err_jquota_change;
+
+			if (sbi->s_qf_names[i] && ctx->s_qf_names[i] &&
+			    strcmp(get_qf_name(sb, sbi, i),
+				   ctx->s_qf_names[i]) != 0)
+				goto err_jquota_specified;
+		}
+
+		if (quota_feature) {
+			ext4_msg(NULL, KERN_INFO,
+				 "Journaled quota options ignored when "
+				 "QUOTA feature is enabled");
+			return 0;
+		}
+	}
+
+	if (ctx->spec & EXT4_SPEC_JQFMT) {
+		if (sbi->s_jquota_fmt != ctx->s_jquota_fmt && quota_loaded)
+			goto err_jquota_change;
+		if (quota_feature) {
+			ext4_msg(NULL, KERN_INFO, "Quota format mount options "
+				 "ignored when QUOTA feature is enabled");
+			return 0;
+		}
+	}
+
+	/* Make sure we don't mix old and new quota format */
+	usr_qf_name = (get_qf_name(sb, sbi, USRQUOTA) ||
+		       ctx->s_qf_names[USRQUOTA]);
+	grp_qf_name = (get_qf_name(sb, sbi, GRPQUOTA) ||
+		       ctx->s_qf_names[GRPQUOTA]);
+
+	usrquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) ||
+		    test_opt(sb, USRQUOTA));
+
+	grpquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) ||
+		    test_opt(sb, GRPQUOTA));
+
+	if (usr_qf_name) {
+		ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA);
+		usrquota = false;
+	}
+	if (grp_qf_name) {
+		ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA);
+		grpquota = false;
+	}
+
+	if (usr_qf_name || grp_qf_name) {
+		if (usrquota || grpquota) {
+			ext4_msg(NULL, KERN_ERR, "old and new quota "
+				 "format mixing");
+			return -EINVAL;
+		}
+
+		if (!(ctx->spec & EXT4_SPEC_JQFMT || sbi->s_jquota_fmt)) {
+			ext4_msg(NULL, KERN_ERR, "journaled quota format "
+				 "not specified");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+
+err_quota_change:
+	ext4_msg(NULL, KERN_ERR,
+		 "Cannot change quota options when quota turned on");
+	return -EINVAL;
+err_jquota_change:
+	ext4_msg(NULL, KERN_ERR, "Cannot change journaled quota "
+		 "options when quota turned on");
+	return -EINVAL;
+err_jquota_specified:
+	ext4_msg(NULL, KERN_ERR, "%s quota file already specified",
+		 QTYPE2NAME(i));
+	return -EINVAL;
+#else
+	return 0;
+#endif
+}
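+/*
+ * Worked example (illustrative, not from the upstream file): mounting
+ * with "usrjquota=aquota.user,jqfmt=vfsv1" selects journaled ("new")
+ * quota, while a bare "grpquota" relies on the legacy path. Combining
+ * the two, e.g. "usrjquota=aquota.user,grpquota", is rejected above
+ * with "old and new quota format mixing".
+ */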
+static int ext4_check_test_dummy_encryption(const struct fs_context *fc,
+					    struct super_block *sb)
+{
+	const struct ext4_fs_context *ctx = fc->fs_private;
+	const struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy))
+		return 0;
+
+	if (!ext4_has_feature_encrypt(sb)) {
+		ext4_msg(NULL, KERN_WARNING,
+			 "test_dummy_encryption requires encrypt feature");
+		return -EINVAL;
+	}
+	/*
+	 * This mount option is just for testing, and it's not worthwhile to
+	 * implement the extra complexity (e.g. RCU protection) that would be
+	 * needed to allow it to be set or changed during remount. We do allow
+	 * it to be specified during remount, but only if there is no change.
+	 */
+	if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+		if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
+						 &ctx->dummy_enc_policy))
+			return 0;
+		ext4_msg(NULL, KERN_WARNING,
+			 "Can't set or change test_dummy_encryption on remount");
+		return -EINVAL;
+	}
+	/* Also make sure s_mount_opts didn't contain a conflicting value. */
+	if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy)) {
+		if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
+						 &ctx->dummy_enc_policy))
+			return 0;
+		ext4_msg(NULL, KERN_WARNING,
+			 "Conflicting test_dummy_encryption options");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx,
+					     struct super_block *sb)
+{
+	if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) ||
+	    /* if already set, it was already verified to be the same */
+	    fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy))
+		return;
+	EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy;
+	memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy));
+	ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
+}
+
+static int ext4_check_opt_consistency(struct fs_context *fc,
+				      struct super_block *sb)
+{
+	struct ext4_fs_context *ctx = fc->fs_private;
+	struct ext4_sb_info *sbi = fc->s_fs_info;
+	int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;
+	int err;
+
+	if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
+		ext4_msg(NULL, KERN_ERR,
+			 "Mount option(s) incompatible with ext2");
+		return -EINVAL;
+	}
+	if ((ctx->opt_flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
+		ext4_msg(NULL, KERN_ERR,
+			 "Mount option(s) incompatible with ext3");
+		return -EINVAL;
+	}
+
+	if (ctx->s_want_extra_isize >
+	    (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE)) {
+		ext4_msg(NULL, KERN_ERR,
+			 "Invalid want_extra_isize %d",
+			 ctx->s_want_extra_isize);
+		return -EINVAL;
+	}
+
+	err = ext4_check_test_dummy_encryption(fc, sb);
+	if (err)
+		return err;
+
+	if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) {
+		if (!sbi->s_journal) {
+			ext4_msg(NULL, KERN_WARNING,
+				 "Remounting file system with no journal "
+				 "so ignoring journalled data option");
+			ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS);
+		} else if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS) !=
+			   test_opt(sb, DATA_FLAGS)) {
+			ext4_msg(NULL, KERN_ERR, "Cannot change data mode "
+				 "on remount");
+			return -EINVAL;
+		}
+	}
+
+	if (is_remount) {
+		if (!sbi->s_journal &&
+		    ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT)) {
+			ext4_msg(NULL, KERN_WARNING,
+				 "Remounting fs w/o journal so ignoring data_err option");
+			ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT);
+		}
+
+		if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
+		    (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
+			ext4_msg(NULL, KERN_ERR, "can't mount with "
+				 "both data=journal and dax");
+			return -EINVAL;
+		}
+
+		if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
+		    (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
+		     (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
+fail_dax_change_remount:
+			ext4_msg(NULL, KERN_ERR, "can't change "
+				 "dax mount option while remounting");
+			return -EINVAL;
+		} else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER) &&
+			   (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
+			    (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) {
+			goto fail_dax_change_remount;
+		} else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE) &&
+			   ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
+			    (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
+			    !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) {
+			goto fail_dax_change_remount;
+		}
+	}
+
+	return ext4_check_quota_consistency(fc, sb);
+}
+
+static void ext4_apply_options(struct fs_context *fc, struct super_block *sb)
+{
+	struct ext4_fs_context *ctx = fc->fs_private;
+	struct ext4_sb_info *sbi = fc->s_fs_info;
+
+	sbi->s_mount_opt &= ~ctx->mask_s_mount_opt;
+	sbi->s_mount_opt |= ctx->vals_s_mount_opt;
+	sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2;
+	sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2;
+	sb->s_flags &= ~ctx->mask_s_flags;
+	sb->s_flags |= ctx->vals_s_flags;
+
+#define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; })
+	APPLY(s_commit_interval);
+	APPLY(s_stripe);
+	APPLY(s_max_batch_time);
+	APPLY(s_min_batch_time);
+	APPLY(s_want_extra_isize);
+	APPLY(s_inode_readahead_blks);
+	APPLY(s_max_dir_size_kb);
+	APPLY(s_li_wait_mult);
+	APPLY(s_resgid);
+	APPLY(s_resuid);
+
+#ifdef CONFIG_EXT4_DEBUG
+	APPLY(s_fc_debug_max_replay);
+#endif
+
+	ext4_apply_quota_options(fc, sb);
+	ext4_apply_test_dummy_encryption(ctx, sb);
+}
+
+
+static int ext4_validate_options(struct fs_context *fc)
+{
+#ifdef CONFIG_QUOTA
+	struct ext4_fs_context *ctx = fc->fs_private;
+	char *usr_qf_name, *grp_qf_name;
+
+	usr_qf_name = ctx->s_qf_names[USRQUOTA];
+	grp_qf_name = ctx->s_qf_names[GRPQUOTA];
+
+	if (usr_qf_name || grp_qf_name) {
+		if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) && usr_qf_name)
+			ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA);
+
+		if (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) && grp_qf_name)
+			ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA);
+
+		if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) ||
+		    ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA)) {
+			ext4_msg(NULL, KERN_ERR, "old and new quota "
+				 "format mixing");
+			return -EINVAL;
+		}
+	}
+#endif
+	return 1;
+}
+
+static inline void ext4_show_quota_options(struct seq_file *seq,
+					   struct super_block *sb)
+{
+#if defined(CONFIG_QUOTA)
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	char *usr_qf_name, *grp_qf_name;
+
+	if (sbi->s_jquota_fmt) {
+		char *fmtname = "";
+
+		switch (sbi->s_jquota_fmt) {
+		case QFMT_VFS_OLD:
+			fmtname = "vfsold";
+			break;
+		case QFMT_VFS_V0:
+			fmtname = "vfsv0";
+			break;
+		case QFMT_VFS_V1:
+			fmtname = "vfsv1";
+			break;
+		}
+		seq_printf(seq, ",jqfmt=%s", fmtname);
+	}
+
+	rcu_read_lock();
+	usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]);
+	grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]);
+	if (usr_qf_name)
+		seq_show_option(seq, "usrjquota", usr_qf_name);
+	if (grp_qf_name)
+		seq_show_option(seq, "grpjquota", grp_qf_name);
+	rcu_read_unlock();
+#endif
+}
+
+static const char *token2str(int token)
+{
+	const struct fs_parameter_spec *spec;
+
+	for (spec = ext4_param_specs; spec->name != NULL; spec++)
+		if (spec->opt == token && !spec->type)
+			break;
+	return spec->name;
+}
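+/*
+ * Illustration (not from the upstream file): token2str() picks the
+ * flag-form spelling of a token, e.g. token2str(Opt_bsd_df) returns
+ * "bsddf". The "!spec->type" test matters for keys that appear twice
+ * in ext4_param_specs[] (such as "barrier"): only the valueless flag
+ * entry is used for printing.
+ */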
+/*
+ * Show an option if
+ *  - it's set to a non-default value OR
+ *  - if the per-sb default is different from the global default
+ */
+static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
+			      int nodefs)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	int def_errors;
+	const struct mount_opts *m;
+	char sep = nodefs ? '\n' : ',';
+
+#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
+#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)
+
+	if (sbi->s_sb_block != 1)
+		SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);
+
+	for (m = ext4_mount_opts; m->token != Opt_err; m++) {
+		int want_set = m->flags & MOPT_SET;
+		int opt_2 = m->flags & MOPT_2;
+		unsigned int mount_opt, def_mount_opt;
+
+		if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
+		    m->flags & MOPT_SKIP)
+			continue;
+
+		if (opt_2) {
+			mount_opt = sbi->s_mount_opt2;
+			def_mount_opt = sbi->s_def_mount_opt2;
+		} else {
+			mount_opt = sbi->s_mount_opt;
+			def_mount_opt = sbi->s_def_mount_opt;
+		}
+		/* skip if same as the default */
+		if (!nodefs && !(m->mount_opt & (mount_opt ^ def_mount_opt)))
+			continue;
+		/* select Opt_noFoo vs Opt_Foo */
+		if ((want_set &&
+		     (mount_opt & m->mount_opt) != m->mount_opt) ||
+		    (!want_set && (mount_opt & m->mount_opt)))
+			continue;
+		SEQ_OPTS_PRINT("%s", token2str(m->token));
+	}
+
+	if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
+	    ext4_get_resuid(es) != EXT4_DEF_RESUID)
+		SEQ_OPTS_PRINT("resuid=%u",
+			       from_kuid_munged(&init_user_ns, sbi->s_resuid));
+	if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
+	    ext4_get_resgid(es) != EXT4_DEF_RESGID)
+		SEQ_OPTS_PRINT("resgid=%u",
+			       from_kgid_munged(&init_user_ns, sbi->s_resgid));
+	def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
+	if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
+		SEQ_OPTS_PUTS("errors=remount-ro");
+	if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
+		SEQ_OPTS_PUTS("errors=continue");
+	if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
+		SEQ_OPTS_PUTS("errors=panic");
+	if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
+		SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
+	if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
+		SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
+	if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
+		SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
+	if (nodefs && sb->s_flags & SB_I_VERSION)
+		SEQ_OPTS_PUTS("i_version");
+	if (nodefs || sbi->s_stripe)
+		SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
+	if (nodefs || EXT4_MOUNT_DATA_FLAGS &
+		      (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
+		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+			SEQ_OPTS_PUTS("data=journal");
+		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+			SEQ_OPTS_PUTS("data=ordered");
+		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+			SEQ_OPTS_PUTS("data=writeback");
+	}
+	if (nodefs ||
+	    sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
+		SEQ_OPTS_PRINT("inode_readahead_blks=%u",
+			       sbi->s_inode_readahead_blks);
+
+	if (test_opt(sb, INIT_INODE_TABLE) && (nodefs ||
+		     (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
+		SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
+	if (nodefs || sbi->s_max_dir_size_kb)
+		SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
+	if (test_opt(sb, DATA_ERR_ABORT))
+		SEQ_OPTS_PUTS("data_err=abort");
+
+	fscrypt_show_test_dummy_encryption(seq, sep, sb);
+
+	if (sb->s_flags & SB_INLINECRYPT)
+		SEQ_OPTS_PUTS("inlinecrypt");
+
+	if (test_opt(sb, DAX_ALWAYS)) {
+		if (IS_EXT2_SB(sb))
+			SEQ_OPTS_PUTS("dax");
+		else
+			SEQ_OPTS_PUTS("dax=always");
+	} else if (test_opt2(sb, DAX_NEVER)) {
+		SEQ_OPTS_PUTS("dax=never");
+	} else if (test_opt2(sb, DAX_INODE)) {
+		SEQ_OPTS_PUTS("dax=inode");
+	}
+
+	if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
+	    !test_opt2(sb, MB_OPTIMIZE_SCAN)) {
+		SEQ_OPTS_PUTS("mb_optimize_scan=0");
+	} else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
+		   test_opt2(sb, MB_OPTIMIZE_SCAN)) {
+		SEQ_OPTS_PUTS("mb_optimize_scan=1");
+	}
+
+	if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS))
+		SEQ_OPTS_PUTS("prefetch_block_bitmaps");
+
+	if (ext4_emergency_ro(sb))
+		SEQ_OPTS_PUTS("emergency_ro");
+
+	if (ext4_forced_shutdown(sb))
+		SEQ_OPTS_PUTS("shutdown");
+
+	ext4_show_quota_options(seq, sb);
+	return 0;
+}
+
+static int ext4_show_options(struct seq_file *seq, struct dentry *root)
+{
+	return _ext4_show_options(seq, root->d_sb, 0);
+}
+
+int ext4_seq_options_show(struct seq_file *seq, void *offset)
+{
+	struct super_block *sb = seq->private;
+	int rc;
+
+	seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw");
+	rc = _ext4_show_options(seq, sb, 1);
+	seq_putc(seq, '\n');
+	return rc;
+}
+
+static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
+			    int read_only)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	int err = 0;
+
+	if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
+		ext4_msg(sb, KERN_ERR, "revision level too high, "
+			 "forcing read-only mode");
+		err = -EROFS;
+		goto done;
+	}
+	if (read_only)
+		goto done;
+	if (!(sbi->s_mount_state & EXT4_VALID_FS))
+		ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
+			 "running e2fsck is recommended");
+	else if (sbi->s_mount_state & EXT4_ERROR_FS)
+		ext4_msg(sb, KERN_WARNING,
+			 "warning: mounting fs with errors, "
+			 "running e2fsck is recommended");
+	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
+		 le16_to_cpu(es->s_mnt_count) >=
+		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
+		ext4_msg(sb, KERN_WARNING,
+			 "warning: maximal mount count reached, "
+			 "running e2fsck is recommended");
+	else if (le32_to_cpu(es->s_checkinterval) &&
+		 (ext4_get_tstamp(es, s_lastcheck) +
+		  le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds()))
+		ext4_msg(sb, KERN_WARNING,
+			 "warning: checktime reached, "
+			 "running e2fsck is recommended");
+	if (!sbi->s_journal)
+		es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
+	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
+		es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
+	le16_add_cpu(&es->s_mnt_count, 1);
+	ext4_update_tstamp(es, s_mtime);
+	if (sbi->s_journal) {
+		ext4_set_feature_journal_needs_recovery(sb);
+		if (ext4_has_feature_orphan_file(sb))
+			ext4_set_feature_orphan_present(sb);
+	}
+
+	err = ext4_commit_super(sb);
+done:
+	if (test_opt(sb, DEBUG))
+		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
+				"bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
+			sb->s_blocksize,
+			sbi->s_groups_count,
+			EXT4_BLOCKS_PER_GROUP(sb),
+			EXT4_INODES_PER_GROUP(sb),
+			sbi->s_mount_opt, sbi->s_mount_opt2);
+	return err;
+}
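+/*
+ * Illustrative note (not from the upstream file): s_max_mnt_count is a
+ * little-endian 16-bit field interpreted as signed, and mke2fs usually
+ * writes -1 (0xffff) by default, which disables the "maximal mount
+ * count reached" warning: (__s16)le16_to_cpu(0xffff) == -1 fails the
+ * "> 0" test above.
+ */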
sizeof(struct flex_groups)), + GFP_KERNEL); + if (!new_groups[i]) { + for (j = sbi->s_flex_groups_allocated; j < i; j++) + kvfree(new_groups[j]); + kvfree(new_groups); + ext4_msg(sb, KERN_ERR, + "not enough memory for %d flex groups", size); + return -ENOMEM; + } + } + rcu_read_lock(); + old_groups = rcu_dereference(sbi->s_flex_groups); + if (old_groups) + memcpy(new_groups, old_groups, + (sbi->s_flex_groups_allocated * + sizeof(struct flex_groups *))); + rcu_read_unlock(); + rcu_assign_pointer(sbi->s_flex_groups, new_groups); + sbi->s_flex_groups_allocated = size; + if (old_groups) + ext4_kvfree_array_rcu(old_groups); + return 0; +} + +static int ext4_fill_flex_info(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + struct flex_groups *fg; + ext4_group_t flex_group; + int i, err; + + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; + if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { + sbi->s_log_groups_per_flex = 0; + return 1; + } + + err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); + if (err) + goto failed; + + for (i = 0; i < sbi->s_groups_count; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + + flex_group = ext4_flex_group(sbi, i); + fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); + atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes); + atomic64_add(ext4_free_group_clusters(sb, gdp), + &fg->free_clusters); + atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs); + } + + return 1; +failed: + return 0; +} + +static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, + struct ext4_group_desc *gdp) +{ + int offset = offsetof(struct ext4_group_desc, bg_checksum); + __u16 crc = 0; + __le32 le_group = cpu_to_le32(block_group); + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (ext4_has_feature_metadata_csum(sbi->s_sb)) { + /* Use new metadata_csum algorithm */ + __u32 csum32; + __u16 dummy_csum = 0; + + csum32 = ext4_chksum(sbi->s_csum_seed, (__u8 *)&le_group, + sizeof(le_group)); + csum32 = ext4_chksum(csum32, (__u8 *)gdp, offset); + csum32 = ext4_chksum(csum32, (__u8 *)&dummy_csum, + sizeof(dummy_csum)); + offset += sizeof(dummy_csum); + if (offset < sbi->s_desc_size) + csum32 = ext4_chksum(csum32, (__u8 *)gdp + offset, + sbi->s_desc_size - offset); + + crc = csum32 & 0xFFFF; + goto out; + } + + /* old crc16 code */ + if (!ext4_has_feature_gdt_csum(sb)) + return 0; + + crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); + crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); + crc = crc16(crc, (__u8 *)gdp, offset); + offset += sizeof(gdp->bg_checksum); /* skip checksum */ + /* for checksum of struct ext4_group_desc do the rest...*/ + if (ext4_has_feature_64bit(sb) && offset < sbi->s_desc_size) + crc = crc16(crc, (__u8 *)gdp + offset, + sbi->s_desc_size - offset); + +out: + return cpu_to_le16(crc); +} + +int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group, + struct ext4_group_desc *gdp) +{ + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp))) + return 0; + + return 1; +} + +void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, + struct ext4_group_desc *gdp) +{ + if (!ext4_has_group_desc_csum(sb)) + return; + gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp); +} + +/* Called at mount-time, super-block is locked */ +static int ext4_check_descriptors(struct super_block *sb, + ext4_fsblk_t sb_block, + ext4_group_t *first_not_zeroed) 
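+/*
+ * Worked example (illustrative, not from the upstream file): with
+ * metadata_csum the descriptor checksum is a crc32c over
+ *
+ *	s_csum_seed || le32(group number) || descriptor bytes
+ *
+ * where the 16-bit bg_checksum field itself is replaced by zeroes
+ * (the dummy_csum above) so the stored value does not feed into its
+ * own computation; the low 16 bits of the crc32c result are kept.
+ */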
+/* Called at mount-time, super-block is locked */
+static int ext4_check_descriptors(struct super_block *sb,
+				  ext4_fsblk_t sb_block,
+				  ext4_group_t *first_not_zeroed)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
+	ext4_fsblk_t last_block;
+	ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0);
+	ext4_fsblk_t block_bitmap;
+	ext4_fsblk_t inode_bitmap;
+	ext4_fsblk_t inode_table;
+	int flexbg_flag = 0;
+	ext4_group_t i, grp = sbi->s_groups_count;
+
+	if (ext4_has_feature_flex_bg(sb))
+		flexbg_flag = 1;
+
+	ext4_debug("Checking group descriptors");
+
+	for (i = 0; i < sbi->s_groups_count; i++) {
+		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
+
+		if (i == sbi->s_groups_count - 1 || flexbg_flag)
+			last_block = ext4_blocks_count(sbi->s_es) - 1;
+		else
+			last_block = first_block +
+				(EXT4_BLOCKS_PER_GROUP(sb) - 1);
+
+		if ((grp == sbi->s_groups_count) &&
+		    !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+			grp = i;
+
+		block_bitmap = ext4_block_bitmap(sb, gdp);
+		if (block_bitmap == sb_block) {
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+				 "Block bitmap for group %u overlaps "
+				 "superblock", i);
+			if (!sb_rdonly(sb))
+				return 0;
+		}
+		if (block_bitmap >= sb_block + 1 &&
+		    block_bitmap <= last_bg_block) {
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+				 "Block bitmap for group %u overlaps "
+				 "block group descriptors", i);
+			if (!sb_rdonly(sb))
+				return 0;
+		}
+		if (block_bitmap < first_block || block_bitmap > last_block) {
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+				 "Block bitmap for group %u not in group "
+				 "(block %llu)!", i, block_bitmap);
+			return 0;
+		}
+		inode_bitmap = ext4_inode_bitmap(sb, gdp);
+		if (inode_bitmap == sb_block) {
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+				 "Inode bitmap for group %u overlaps "
+				 "superblock", i);
+			if (!sb_rdonly(sb))
+				return 0;
+		}
+		if (inode_bitmap >= sb_block + 1 &&
+		    inode_bitmap <= last_bg_block) {
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+				 "Inode bitmap for group %u overlaps "
+				 "block group descriptors", i);
+			if (!sb_rdonly(sb))
+				return 0;
+		}
+		if (inode_bitmap < first_block || inode_bitmap > last_block) {
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+				 "Inode bitmap for group %u not in group "
+				 "(block %llu)!", i, inode_bitmap);
+			return 0;
+		}
+		inode_table = ext4_inode_table(sb, gdp);
+		if (inode_table == sb_block) {
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+				 "Inode table for group %u overlaps "
+				 "superblock", i);
+			if (!sb_rdonly(sb))
+				return 0;
+		}
+		if (inode_table >= sb_block + 1 &&
+		    inode_table <= last_bg_block) {
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+				 "Inode table for group %u overlaps "
+				 "block group descriptors", i);
+			if (!sb_rdonly(sb))
+				return 0;
+		}
+		if (inode_table < first_block ||
+		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+				 "Inode table for group %u not in group "
+				 "(block %llu)!", i, inode_table);
+			return 0;
+		}
+		ext4_lock_group(sb, i);
+		if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+				 "Checksum for group %u failed (%u!=%u)",
+				 i, le16_to_cpu(ext4_group_desc_csum(sb, i,
+				     gdp)), le16_to_cpu(gdp->bg_checksum));
+			if (!sb_rdonly(sb)) {
+				ext4_unlock_group(sb, i);
+				return 0;
+			}
+		}
+		ext4_unlock_group(sb, i);
+		if (!flexbg_flag)
+			first_block += EXT4_BLOCKS_PER_GROUP(sb);
+	}
+	if (NULL != first_not_zeroed)
+		*first_not_zeroed = grp;
+	return 1;
+}
+
+/*
+ * Maximal extent format file size.
+ * Resulting logical blkno at s_maxbytes must fit in our on-disk
+ * extent format containers, within a sector_t, and within i_blocks
+ * in the vfs. ext4 inode has 48 bits of i_block in fsblock units,
+ * so that won't be a limiting factor.
+ *
+ * However, there is another limiting factor: we store extents as a
+ * starting block plus a length, so the length of an extent covering
+ * the maximum file size must also fit the on-disk containers. Since
+ * the length is always one unit bigger than the largest start (we
+ * count block 0 as well), we have to lower s_maxbytes by one fs block.
+ *
+ * Note, this does *not* consider any metadata overhead for vfs i_blocks.
+ */
+static loff_t ext4_max_size(int blkbits, int has_huge_files)
+{
+	loff_t res;
+	loff_t upper_limit = MAX_LFS_FILESIZE;
+
+	BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64));
+
+	if (!has_huge_files) {
+		upper_limit = (1LL << 32) - 1;
+
+		/* total blocks in file system block size */
+		upper_limit >>= (blkbits - 9);
+		upper_limit <<= blkbits;
+	}
+
+	/*
+	 * 32-bit extent-start container, ee_block. We lower the maxbytes
+	 * by one fs block, so ee_len can cover the extent of maximum file
+	 * size
+	 */
+	res = (1LL << 32) - 1;
+	res <<= blkbits;
+
+	/* Sanity check against vm- & vfs- imposed limits */
+	if (res > upper_limit)
+		res = upper_limit;
+
+	return res;
+}
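+/*
+ * Worked example (illustrative, not from the upstream file): with 4 KiB
+ * blocks and huge files enabled, ee_block limits the file to
+ * (2^32 - 1) blocks, so
+ *
+ *	res = (2^32 - 1) << 12 = 16 TiB - 4 KiB
+ *
+ * which is below MAX_LFS_FILESIZE on 64-bit, so s_maxbytes for extent
+ * files ends up one block short of 16 TiB.
+ */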
 */ + upper_limit -= EXT4_NDIR_BLOCKS; + /* indirect blocks */ + meta_blocks = 1; + upper_limit -= ppb; + /* double indirect blocks */ + if (upper_limit < ppb * ppb) { + meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb); + res -= meta_blocks; + goto check_lfs; + } + meta_blocks += 1 + ppb; + upper_limit -= ppb * ppb; + /* triple indirect blocks for the rest */ + meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb) + + DIV_ROUND_UP_ULL(upper_limit, ppb*ppb); + res -= meta_blocks; +check_lfs: + res <<= bits; + if (res > MAX_LFS_FILESIZE) + res = MAX_LFS_FILESIZE; + + return res; +} + +static ext4_fsblk_t descriptor_loc(struct super_block *sb, + ext4_fsblk_t logical_sb_block, int nr) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t bg, first_meta_bg; + int has_super = 0; + + first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); + + if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg) + return logical_sb_block + nr + 1; + bg = sbi->s_desc_per_block * nr; + if (ext4_bg_has_super(sb, bg)) + has_super = 1; + + /* + * If we have a meta_bg fs with 1k blocks, group 0's GDT is at + * block 2, not 1. If s_first_data_block == 0 (bigalloc is enabled + * on modern mke2fs or blksize > 1k on older mke2fs) then we must + * compensate. + */ + if (sb->s_blocksize == 1024 && nr == 0 && + le32_to_cpu(sbi->s_es->s_first_data_block) == 0) + has_super++; + + return (has_super + ext4_group_first_block_no(sb, bg)); +} + +/** + * ext4_get_stripe_size: Get the stripe size. + * @sbi: In memory super block info + * + * If the stripe size was specified via a mount option, use that value. + * If the value specified at mount time is greater than the blocks per + * group, use the superblock value. + * If the superblock value is greater than the blocks per group, return 0. + * The allocator needs it to be less than the blocks per group. + * + */ +static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) +{ + unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); + unsigned long stripe_width = + le32_to_cpu(sbi->s_es->s_raid_stripe_width); + int ret; + + if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) + ret = sbi->s_stripe; + else if (stripe_width && stripe_width <= sbi->s_blocks_per_group) + ret = stripe_width; + else if (stride && stride <= sbi->s_blocks_per_group) + ret = stride; + else + ret = 0; + + /* + * If the stripe width is 1, this makes no sense and + * we set it to 0 to turn off stripe handling code. + */ + if (ret <= 1) + ret = 0; + + return ret; +} + +/* + * Check whether this filesystem can be mounted based on + * the features present and the RDONLY/RDWR mount requested. + * Returns 1 if this filesystem can be mounted as requested, + * 0 if it cannot be.
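+ * (Note: unlike most helpers in this file, which return 0 or -errno, + * this one is boolean-style; callers treat it as a predicate.)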
+ */ +int ext4_feature_set_ok(struct super_block *sb, int readonly) +{ + if (ext4_has_unknown_ext4_incompat_features(sb)) { + ext4_msg(sb, KERN_ERR, + "Couldn't mount because of " + "unsupported optional features (%x)", + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & + ~EXT4_FEATURE_INCOMPAT_SUPP)); + return 0; + } + + if (!IS_ENABLED(CONFIG_UNICODE) && ext4_has_feature_casefold(sb)) { + ext4_msg(sb, KERN_ERR, + "Filesystem with casefold feature cannot be " + "mounted without CONFIG_UNICODE"); + return 0; + } + + if (readonly) + return 1; + + if (ext4_has_feature_readonly(sb)) { + ext4_msg(sb, KERN_INFO, "filesystem is read-only"); + sb->s_flags |= SB_RDONLY; + return 1; + } + + /* Check that feature set is OK for a read-write mount */ + if (ext4_has_unknown_ext4_ro_compat_features(sb)) { + ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " + "unsupported optional features (%x)", + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & + ~EXT4_FEATURE_RO_COMPAT_SUPP)); + return 0; + } + if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) { + ext4_msg(sb, KERN_ERR, + "Can't support bigalloc feature without " + "extents feature\n"); + return 0; + } + +#if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2) + if (!readonly && (ext4_has_feature_quota(sb) || + ext4_has_feature_project(sb))) { + ext4_msg(sb, KERN_ERR, + "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2"); + return 0; + } +#endif /* CONFIG_QUOTA */ + return 1; +} + +/* + * This function is called once a day if we have errors logged + * on the file system + */ +static void print_daily_error_info(struct timer_list *t) +{ + struct ext4_sb_info *sbi = timer_container_of(sbi, t, s_err_report); + struct super_block *sb = sbi->s_sb; + struct ext4_super_block *es = sbi->s_es; + + if (es->s_error_count) + /* fsck newer than v1.41.13 is needed to clean this condition. 
*/ + ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", + le32_to_cpu(es->s_error_count)); + if (es->s_first_error_time) { + printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d", + sb->s_id, + ext4_get_tstamp(es, s_first_error_time), + (int) sizeof(es->s_first_error_func), + es->s_first_error_func, + le32_to_cpu(es->s_first_error_line)); + if (es->s_first_error_ino) + printk(KERN_CONT ": inode %u", + le32_to_cpu(es->s_first_error_ino)); + if (es->s_first_error_block) + printk(KERN_CONT ": block %llu", (unsigned long long) + le64_to_cpu(es->s_first_error_block)); + printk(KERN_CONT "\n"); + } + if (es->s_last_error_time) { + printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d", + sb->s_id, + ext4_get_tstamp(es, s_last_error_time), + (int) sizeof(es->s_last_error_func), + es->s_last_error_func, + le32_to_cpu(es->s_last_error_line)); + if (es->s_last_error_ino) + printk(KERN_CONT ": inode %u", + le32_to_cpu(es->s_last_error_ino)); + if (es->s_last_error_block) + printk(KERN_CONT ": block %llu", (unsigned long long) + le64_to_cpu(es->s_last_error_block)); + printk(KERN_CONT "\n"); + } + mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ +} + +/* Find next suitable group and run ext4_init_inode_table */ +static int ext4_run_li_request(struct ext4_li_request *elr) +{ + struct ext4_group_desc *gdp = NULL; + struct super_block *sb = elr->lr_super; + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t group = elr->lr_next_group; + unsigned int prefetch_ios = 0; + int ret = 0; + int nr = EXT4_SB(sb)->s_mb_prefetch; + u64 start_time; + + if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) { + elr->lr_next_group = ext4_mb_prefetch(sb, group, nr, &prefetch_ios); + ext4_mb_prefetch_fini(sb, elr->lr_next_group, nr); + trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, nr); + if (group >= elr->lr_next_group) { + ret = 1; + if (elr->lr_first_not_zeroed != ngroups && + !ext4_emergency_state(sb) && !sb_rdonly(sb) && + test_opt(sb, INIT_INODE_TABLE)) { + elr->lr_next_group = elr->lr_first_not_zeroed; + elr->lr_mode = EXT4_LI_MODE_ITABLE; + ret = 0; + } + } + return ret; + } + + for (; group < ngroups; group++) { + gdp = ext4_get_group_desc(sb, group, NULL); + if (!gdp) { + ret = 1; + break; + } + + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + break; + } + + if (group >= ngroups) + ret = 1; + + if (!ret) { + start_time = ktime_get_ns(); + ret = ext4_init_inode_table(sb, group, + elr->lr_timeout ? 0 : 1); + trace_ext4_lazy_itable_init(sb, group); + if (elr->lr_timeout == 0) { + elr->lr_timeout = nsecs_to_jiffies((ktime_get_ns() - start_time) * + EXT4_SB(elr->lr_super)->s_li_wait_mult); + } + elr->lr_next_sched = jiffies + elr->lr_timeout; + elr->lr_next_group = group + 1; + } + return ret; +} + +/* + * Remove lr_request from the list_request and free the + * request structure. 
Should be called with li_list_mtx held. + */ +static void ext4_remove_li_request(struct ext4_li_request *elr) +{ + if (!elr) + return; + + list_del(&elr->lr_request); + EXT4_SB(elr->lr_super)->s_li_request = NULL; + kfree(elr); +} + +static void ext4_unregister_li_request(struct super_block *sb) +{ + mutex_lock(&ext4_li_mtx); + if (!ext4_li_info) { + mutex_unlock(&ext4_li_mtx); + return; + } + + mutex_lock(&ext4_li_info->li_list_mtx); + ext4_remove_li_request(EXT4_SB(sb)->s_li_request); + mutex_unlock(&ext4_li_info->li_list_mtx); + mutex_unlock(&ext4_li_mtx); +} + +static struct task_struct *ext4_lazyinit_task; + +/* + * This is the function where the ext4lazyinit thread lives. It walks + * through the request list searching for the next scheduled filesystem. + * When such a fs is found, it runs the lazy initialization request + * (ext4_run_li_request) and keeps track of the time spent in this + * function. Based on that time we compute the next schedule time of + * the request. When the walk through the list is complete, the thread + * computes the next wakeup time and puts itself to sleep. + */ +static int ext4_lazyinit_thread(void *arg) +{ + struct ext4_lazy_init *eli = arg; + struct list_head *pos, *n; + struct ext4_li_request *elr; + unsigned long next_wakeup, cur; + + BUG_ON(NULL == eli); + set_freezable(); + +cont_thread: + while (true) { + bool next_wakeup_initialized = false; + + next_wakeup = 0; + mutex_lock(&eli->li_list_mtx); + if (list_empty(&eli->li_request_list)) { + mutex_unlock(&eli->li_list_mtx); + goto exit_thread; + } + list_for_each_safe(pos, n, &eli->li_request_list) { + int err = 0; + int progress = 0; + elr = list_entry(pos, struct ext4_li_request, + lr_request); + + if (time_before(jiffies, elr->lr_next_sched)) { + if (!next_wakeup_initialized || + time_before(elr->lr_next_sched, next_wakeup)) { + next_wakeup = elr->lr_next_sched; + next_wakeup_initialized = true; + } + continue; + } + if (down_read_trylock(&elr->lr_super->s_umount)) { + if (sb_start_write_trylock(elr->lr_super)) { + progress = 1; + /* + * We hold sb->s_umount, sb can not + * be removed from the list, it is + * now safe to drop li_list_mtx + */ + mutex_unlock(&eli->li_list_mtx); + err = ext4_run_li_request(elr); + sb_end_write(elr->lr_super); + mutex_lock(&eli->li_list_mtx); + n = pos->next; + } + up_read((&elr->lr_super->s_umount)); + } + /* error, remove the lazy_init job */ + if (err) { + ext4_remove_li_request(elr); + continue; + } + if (!progress) { + elr->lr_next_sched = jiffies + + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); + } + if (!next_wakeup_initialized || + time_before(elr->lr_next_sched, next_wakeup)) { + next_wakeup = elr->lr_next_sched; + next_wakeup_initialized = true; + } + } + mutex_unlock(&eli->li_list_mtx); + + try_to_freeze(); + + cur = jiffies; + if (!next_wakeup_initialized || time_after_eq(cur, next_wakeup)) { + cond_resched(); + continue; + } + + schedule_timeout_interruptible(next_wakeup - cur); + + if (kthread_should_stop()) { + ext4_clear_request_list(); + goto exit_thread; + } + } + +exit_thread: + /* + * It looks like the request list is empty, but we need + * to check it under the li_list_mtx lock, to prevent any + * additions into it, and of course we should lock ext4_li_mtx + * to atomically free the list and ext4_li_info, because at + * this point another ext4 filesystem could be registering a + * new one.
 + */ + mutex_lock(&ext4_li_mtx); + mutex_lock(&eli->li_list_mtx); + if (!list_empty(&eli->li_request_list)) { + mutex_unlock(&eli->li_list_mtx); + mutex_unlock(&ext4_li_mtx); + goto cont_thread; + } + mutex_unlock(&eli->li_list_mtx); + kfree(ext4_li_info); + ext4_li_info = NULL; + mutex_unlock(&ext4_li_mtx); + + return 0; +} + +static void ext4_clear_request_list(void) +{ + struct list_head *pos, *n; + struct ext4_li_request *elr; + + mutex_lock(&ext4_li_info->li_list_mtx); + list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { + elr = list_entry(pos, struct ext4_li_request, + lr_request); + ext4_remove_li_request(elr); + } + mutex_unlock(&ext4_li_info->li_list_mtx); +} + +static int ext4_run_lazyinit_thread(void) +{ + ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread, + ext4_li_info, "ext4lazyinit"); + if (IS_ERR(ext4_lazyinit_task)) { + int err = PTR_ERR(ext4_lazyinit_task); + ext4_clear_request_list(); + kfree(ext4_li_info); + ext4_li_info = NULL; + printk(KERN_CRIT "EXT4-fs: error %d creating inode table " + "initialization thread\n", + err); + return err; + } + ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; + return 0; +} + +/* + * Check whether it makes sense to run the itable init. thread or not. + * If there is at least one uninitialized inode table, return the + * corresponding group number; otherwise the loop goes through all + * groups and returns the total number of groups. + */ +static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) +{ + ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count; + struct ext4_group_desc *gdp = NULL; + + if (!ext4_has_group_desc_csum(sb)) + return ngroups; + + for (group = 0; group < ngroups; group++) { + gdp = ext4_get_group_desc(sb, group, NULL); + if (!gdp) + continue; + + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + break; + } + + return group; +} + +static int ext4_li_info_new(void) +{ + struct ext4_lazy_init *eli = NULL; + + eli = kzalloc(sizeof(*eli), GFP_KERNEL); + if (!eli) + return -ENOMEM; + + INIT_LIST_HEAD(&eli->li_request_list); + mutex_init(&eli->li_list_mtx); + + eli->li_state |= EXT4_LAZYINIT_QUIT; + + ext4_li_info = eli; + + return 0; +} + +static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, + ext4_group_t start) +{ + struct ext4_li_request *elr; + + elr = kzalloc(sizeof(*elr), GFP_KERNEL); + if (!elr) + return NULL; + + elr->lr_super = sb; + elr->lr_first_not_zeroed = start; + if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) { + elr->lr_mode = EXT4_LI_MODE_ITABLE; + elr->lr_next_group = start; + } else { + elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; + } + + /* + * Randomize the first schedule time of the request to + * spread the inode table initialization requests + * better. + */ + elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); + return elr; +} + +int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_li_request *elr = NULL; + ext4_group_t ngroups = sbi->s_groups_count; + int ret = 0; + + mutex_lock(&ext4_li_mtx); + if (sbi->s_li_request != NULL) { + /* + * Reset the timeout so it can be computed again, because + * s_li_wait_mult might have changed.
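+ * (A zero lr_timeout makes ext4_run_li_request() time one group's + * initialization and multiply that by s_li_wait_mult to derive the + * new interval.)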
+ */ + sbi->s_li_request->lr_timeout = 0; + goto out; + } + + if (ext4_emergency_state(sb) || sb_rdonly(sb) || + (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && + (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE)))) + goto out; + + elr = ext4_li_request_new(sb, first_not_zeroed); + if (!elr) { + ret = -ENOMEM; + goto out; + } + + if (NULL == ext4_li_info) { + ret = ext4_li_info_new(); + if (ret) + goto out; + } + + mutex_lock(&ext4_li_info->li_list_mtx); + list_add(&elr->lr_request, &ext4_li_info->li_request_list); + mutex_unlock(&ext4_li_info->li_list_mtx); + + sbi->s_li_request = elr; + /* + * set elr to NULL here since it has been inserted to + * the request_list and the removal and free of it is + * handled by ext4_clear_request_list from now on. + */ + elr = NULL; + + if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { + ret = ext4_run_lazyinit_thread(); + if (ret) + goto out; + } +out: + mutex_unlock(&ext4_li_mtx); + if (ret) + kfree(elr); + return ret; +} + +/* + * We do not need to lock anything since this is called on + * module unload. + */ +static void ext4_destroy_lazyinit_thread(void) +{ + /* + * If thread exited earlier + * there's nothing to be done. + */ + if (!ext4_li_info || !ext4_lazyinit_task) + return; + + kthread_stop(ext4_lazyinit_task); +} + +static int set_journal_csum_feature_set(struct super_block *sb) +{ + int ret = 1; + int compat, incompat; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (ext4_has_feature_metadata_csum(sb)) { + /* journal checksum v3 */ + compat = 0; + incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; + } else { + /* journal checksum v1 */ + compat = JBD2_FEATURE_COMPAT_CHECKSUM; + incompat = 0; + } + + jbd2_journal_clear_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_CSUM_V3 | + JBD2_FEATURE_INCOMPAT_CSUM_V2); + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ret = jbd2_journal_set_features(sbi->s_journal, + compat, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | + incompat); + } else if (test_opt(sb, JOURNAL_CHECKSUM)) { + ret = jbd2_journal_set_features(sbi->s_journal, + compat, 0, + incompat); + jbd2_journal_clear_features(sbi->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } else { + jbd2_journal_clear_features(sbi->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } + + return ret; +} + +/* + * Note: calculating the overhead so we can be compatible with + * historical BSD practice is quite difficult in the face of + * clusters/bigalloc. This is because multiple metadata blocks from + * different block group can end up in the same allocation cluster. + * Calculating the exact overhead in the face of clustered allocation + * requires either O(all block bitmaps) in memory or O(number of block + * groups**2) in time. We will still calculate the superblock for + * older file systems --- and if we come across with a bigalloc file + * system with zero in s_overhead_clusters the estimate will be close to + * correct especially for very large cluster sizes --- but for newer + * file systems, it's better to calculate this figure once at mkfs + * time, and store it in the superblock. If the superblock value is + * present (even for non-bigalloc file systems), we will use it. 
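+ * (Here "overhead" means the superblock/GDT copies, the block and inode + * bitmaps and the inode tables; the O(groups**2) term arises because + * count_overhead() below scans every group's descriptors for each group + * it is asked about.)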
+ */ +static int count_overhead(struct super_block *sb, ext4_group_t grp, + char *buf) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp; + ext4_fsblk_t first_block, last_block, b; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + int s, j, count = 0; + int has_super = ext4_bg_has_super(sb, grp); + + if (!ext4_has_feature_bigalloc(sb)) + return (has_super + ext4_bg_num_gdb(sb, grp) + + (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) + + sbi->s_itb_per_group + 2); + + first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + + (grp * EXT4_BLOCKS_PER_GROUP(sb)); + last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1; + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + b = ext4_block_bitmap(sb, gdp); + if (b >= first_block && b <= last_block) { + ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); + count++; + } + b = ext4_inode_bitmap(sb, gdp); + if (b >= first_block && b <= last_block) { + ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); + count++; + } + b = ext4_inode_table(sb, gdp); + if (b >= first_block && b + sbi->s_itb_per_group <= last_block) + for (j = 0; j < sbi->s_itb_per_group; j++, b++) { + int c = EXT4_B2C(sbi, b - first_block); + ext4_set_bit(c, buf); + count++; + } + if (i != grp) + continue; + s = 0; + if (ext4_bg_has_super(sb, grp)) { + ext4_set_bit(s++, buf); + count++; + } + j = ext4_bg_num_gdb(sb, grp); + if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) { + ext4_error(sb, "Invalid number of block group " + "descriptor blocks: %d", j); + j = EXT4_BLOCKS_PER_GROUP(sb) - s; + } + count += j; + for (; j > 0; j--) + ext4_set_bit(EXT4_B2C(sbi, s++), buf); + } + if (!count) + return 0; + return EXT4_CLUSTERS_PER_GROUP(sb) - + ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8); +} + +/* + * Compute the overhead and stash it in sbi->s_overhead + */ +int ext4_calculate_overhead(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct inode *j_inode; + unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum); + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + ext4_fsblk_t overhead = 0; + char *buf = (char *) get_zeroed_page(GFP_NOFS); + + if (!buf) + return -ENOMEM; + + /* + * Compute the overhead (FS structures). This is constant + * for a given filesystem unless the number of block groups + * changes so we cache the previous value until it does. 
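+ * (Flow sketch: count everything below s_first_data_block, add each + * group's metadata via count_overhead(), add the journal blocks, then + * publish the total in sbi->s_overhead.)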
+ */ + + /* + * All of the blocks before first_data_block are overhead + */ + overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); + + /* + * Add the overhead found in each block group + */ + for (i = 0; i < ngroups; i++) { + int blks; + + blks = count_overhead(sb, i, buf); + overhead += blks; + if (blks) + memset(buf, 0, PAGE_SIZE); + cond_resched(); + } + + /* + * Add the internal journal blocks whether the journal has been + * loaded or not + */ + if (sbi->s_journal && !sbi->s_journal_bdev_file) + overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); + else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { + /* j_inum for internal journal is non-zero */ + j_inode = ext4_get_journal_inode(sb, j_inum); + if (!IS_ERR(j_inode)) { + j_blocks = j_inode->i_size >> sb->s_blocksize_bits; + overhead += EXT4_NUM_B2C(sbi, j_blocks); + iput(j_inode); + } else { + ext4_msg(sb, KERN_ERR, "can't get journal size"); + } + } + sbi->s_overhead = overhead; + smp_wmb(); + free_page((unsigned long) buf); + return 0; +} + +static void ext4_set_resv_clusters(struct super_block *sb) +{ + ext4_fsblk_t resv_clusters; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* + * There's no need to reserve anything when we aren't using extents. + * The space estimates are exact, there are no unwritten extents, + * hole punching doesn't need new metadata... This is needed especially + * to keep ext2/3 backward compatibility. + */ + if (!ext4_has_feature_extents(sb)) + return; + /* + * By default we reserve 2% or 4096 clusters, whichever is smaller. + * This should cover the situations where we can not afford to run + * out of space like for example punch hole, or converting + * unwritten extents in delalloc path. In most cases such + * allocation would require 1, or 2 blocks, higher numbers are + * very rare. 
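+ * (Worked example: a 1 TiB filesystem with 4 KiB clusters has 2^28 + * clusters; 2% of that is about 5.3 million, so the min_t() below clamps + * the reservation to 4096 clusters, i.e. 16 MiB.)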
+ */ + resv_clusters = (ext4_blocks_count(sbi->s_es) >> + sbi->s_cluster_bits); + + do_div(resv_clusters, 50); + resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); + + atomic64_set(&sbi->s_resv_clusters, resv_clusters); +} + +static const char *ext4_quota_mode(struct super_block *sb) +{ +#ifdef CONFIG_QUOTA + if (!ext4_quota_capable(sb)) + return "none"; + + if (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb)) + return "journalled"; + else + return "writeback"; +#else + return "disabled"; +#endif +} + +static void ext4_setup_csum_trigger(struct super_block *sb, + enum ext4_journal_trigger_type type, + void (*trigger)( + struct jbd2_buffer_trigger_type *type, + struct buffer_head *bh, + void *mapped_data, + size_t size)) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + sbi->s_journal_triggers[type].sb = sb; + sbi->s_journal_triggers[type].tr_triggers.t_frozen = trigger; +} + +static void ext4_free_sbi(struct ext4_sb_info *sbi) +{ + if (!sbi) + return; + + kfree(sbi->s_blockgroup_lock); + fs_put_dax(sbi->s_daxdev, NULL); + kfree(sbi); +} + +static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb) +{ + struct ext4_sb_info *sbi; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return NULL; + + sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off, + NULL, NULL); + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + + if (!sbi->s_blockgroup_lock) + goto err_out; + + sb->s_fs_info = sbi; + sbi->s_sb = sb; + return sbi; +err_out: + fs_put_dax(sbi->s_daxdev, NULL); + kfree(sbi); + return NULL; +} + +static void ext4_set_def_opts(struct super_block *sb, + struct ext4_super_block *es) +{ + unsigned long def_mount_opts; + + /* Set defaults before we parse the mount options */ + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + set_opt(sb, INIT_INODE_TABLE); + if (def_mount_opts & EXT4_DEFM_DEBUG) + set_opt(sb, DEBUG); + if (def_mount_opts & EXT4_DEFM_BSDGROUPS) + set_opt(sb, GRPID); + if (def_mount_opts & EXT4_DEFM_UID16) + set_opt(sb, NO_UID32); + /* xattr user namespace & acls are now defaulted on */ + set_opt(sb, XATTR_USER); +#ifdef CONFIG_EXT4_FS_POSIX_ACL + set_opt(sb, POSIX_ACL); +#endif + if (ext4_has_feature_fast_commit(sb)) + set_opt2(sb, JOURNAL_FAST_COMMIT); + /* don't forget to enable journal_csum when metadata_csum is enabled. 
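+ * (Otherwise the journal would carry no checksums while the other + * metadata does; set_journal_csum_feature_set() above selects v3 journal + * checksums for exactly this case.)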
*/ + if (ext4_has_feature_metadata_csum(sb)) + set_opt(sb, JOURNAL_CHECKSUM); + + if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) + set_opt(sb, JOURNAL_DATA); + else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) + set_opt(sb, ORDERED_DATA); + else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) + set_opt(sb, WRITEBACK_DATA); + + if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC) + set_opt(sb, ERRORS_PANIC); + else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE) + set_opt(sb, ERRORS_CONT); + else + set_opt(sb, ERRORS_RO); + /* block_validity enabled by default; disable with noblock_validity */ + set_opt(sb, BLOCK_VALIDITY); + if (def_mount_opts & EXT4_DEFM_DISCARD) + set_opt(sb, DISCARD); + + if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) + set_opt(sb, BARRIER); + + /* + * enable delayed allocation by default + * Use -o nodelalloc to turn it off + */ + if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) && + ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) + set_opt(sb, DELALLOC); + + if (sb->s_blocksize <= PAGE_SIZE) + set_opt(sb, DIOREAD_NOLOCK); +} + +static int ext4_handle_clustersize(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int clustersize; + + /* Handle clustersize */ + clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); + if (ext4_has_feature_bigalloc(sb)) { + if (clustersize < sb->s_blocksize) { + ext4_msg(sb, KERN_ERR, + "cluster size (%d) smaller than " + "block size (%lu)", clustersize, sb->s_blocksize); + return -EINVAL; + } + sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - + le32_to_cpu(es->s_log_block_size); + } else { + if (clustersize != sb->s_blocksize) { + ext4_msg(sb, KERN_ERR, + "fragment/cluster size (%d) != " + "block size (%lu)", clustersize, sb->s_blocksize); + return -EINVAL; + } + if (sbi->s_blocks_per_group > sb->s_blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#blocks per group too big: %lu", + sbi->s_blocks_per_group); + return -EINVAL; + } + sbi->s_cluster_bits = 0; + } + sbi->s_clusters_per_group = le32_to_cpu(es->s_clusters_per_group); + if (sbi->s_clusters_per_group > sb->s_blocksize * 8) { + ext4_msg(sb, KERN_ERR, "#clusters per group too big: %lu", + sbi->s_clusters_per_group); + return -EINVAL; + } + if (sbi->s_blocks_per_group != + (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) { + ext4_msg(sb, KERN_ERR, + "blocks per group (%lu) and clusters per group (%lu) inconsistent", + sbi->s_blocks_per_group, sbi->s_clusters_per_group); + return -EINVAL; + } + sbi->s_cluster_ratio = clustersize / sb->s_blocksize; + + /* Do we have standard group size of clustersize * 8 blocks ? */ + if (sbi->s_blocks_per_group == clustersize << 3) + set_opt2(sb, STD_GROUP_SIZE); + + return 0; +} + +/* + * ext4_atomic_write_init: Initializes filesystem min & max atomic write units. + * With non-bigalloc filesystem awu will be based upon filesystem blocksize + * & bdev awu units. + * With bigalloc it will be based upon bigalloc cluster size & bdev awu units. 
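+ * (Assumption, noted for clarity: "awu" is the atomic write unit; on + * recent kernels the resulting min/max pair is surfaced to userspace + * via statx(STATX_WRITE_ATOMIC).)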
+ * @sb: super block + */ +static void ext4_atomic_write_init(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct block_device *bdev = sb->s_bdev; + unsigned int clustersize = EXT4_CLUSTER_SIZE(sb); + + if (!bdev_can_atomic_write(bdev)) + return; + + if (!ext4_has_feature_extents(sb)) + return; + + sbi->s_awu_min = max(sb->s_blocksize, + bdev_atomic_write_unit_min_bytes(bdev)); + sbi->s_awu_max = min(clustersize, + bdev_atomic_write_unit_max_bytes(bdev)); + if (sbi->s_awu_min && sbi->s_awu_max && + sbi->s_awu_min <= sbi->s_awu_max) { + ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u", + sbi->s_awu_min, sbi->s_awu_max); + } else { + sbi->s_awu_min = 0; + sbi->s_awu_max = 0; + } +} + +static void ext4_fast_commit_init(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* Initialize fast commit stuff */ + atomic_set(&sbi->s_fc_subtid, 0); + INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]); + INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]); + INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); + INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); + sbi->s_fc_bytes = 0; + ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + sbi->s_fc_ineligible_tid = 0; + mutex_init(&sbi->s_fc_lock); + memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); + sbi->s_fc_replay_state.fc_regions = NULL; + sbi->s_fc_replay_state.fc_regions_size = 0; + sbi->s_fc_replay_state.fc_regions_used = 0; + sbi->s_fc_replay_state.fc_regions_valid = 0; + sbi->s_fc_replay_state.fc_modified_inodes = NULL; + sbi->s_fc_replay_state.fc_modified_inodes_size = 0; + sbi->s_fc_replay_state.fc_modified_inodes_used = 0; +} + +static int ext4_inode_info_init(struct super_block *sb, + struct ext4_super_block *es) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { + sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; + sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; + } else { + sbi->s_inode_size = le16_to_cpu(es->s_inode_size); + sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { + ext4_msg(sb, KERN_ERR, "invalid first ino: %u", + sbi->s_first_ino); + return -EINVAL; + } + if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || + (!is_power_of_2(sbi->s_inode_size)) || + (sbi->s_inode_size > sb->s_blocksize)) { + ext4_msg(sb, KERN_ERR, + "unsupported inode size: %d", + sbi->s_inode_size); + ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize); + return -EINVAL; + } + /* + * i_atime_extra is the last extra field available for + * [acm]times in struct ext4_inode. Checking for that + * field should suffice to ensure we have extra space + * for all three. 
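+ * (The offsetof() check below is equivalent to asking whether the + * on-disk inode is large enough to hold the extra 32-bit epoch/nsec + * words for all of atime, ctime and mtime.)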
+ */ + if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + + sizeof(((struct ext4_inode *)0)->i_atime_extra)) { + sb->s_time_gran = 1; + sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; + } else { + sb->s_time_gran = NSEC_PER_SEC; + sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; + } + sb->s_time_min = EXT4_TIMESTAMP_MIN; + } + + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + if (ext4_has_feature_extra_isize(sb)) { + unsigned v, max = (sbi->s_inode_size - + EXT4_GOOD_OLD_INODE_SIZE); + + v = le16_to_cpu(es->s_want_extra_isize); + if (v > max) { + ext4_msg(sb, KERN_ERR, + "bad s_want_extra_isize: %d", v); + return -EINVAL; + } + if (sbi->s_want_extra_isize < v) + sbi->s_want_extra_isize = v; + + v = le16_to_cpu(es->s_min_extra_isize); + if (v > max) { + ext4_msg(sb, KERN_ERR, + "bad s_min_extra_isize: %d", v); + return -EINVAL; + } + if (sbi->s_want_extra_isize < v) + sbi->s_want_extra_isize = v; + } + } + + return 0; +} + +#if IS_ENABLED(CONFIG_UNICODE) +static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) +{ + const struct ext4_sb_encodings *encoding_info; + struct unicode_map *encoding; + __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags); + + if (!ext4_has_feature_casefold(sb) || sb->s_encoding) + return 0; + + encoding_info = ext4_sb_read_encoding(es); + if (!encoding_info) { + ext4_msg(sb, KERN_ERR, + "Encoding requested by superblock is unknown"); + return -EINVAL; + } + + encoding = utf8_load(encoding_info->version); + if (IS_ERR(encoding)) { + ext4_msg(sb, KERN_ERR, + "can't mount with superblock charset: %s-%u.%u.%u " + "not supported by the kernel. flags: 0x%x.", + encoding_info->name, + unicode_major(encoding_info->version), + unicode_minor(encoding_info->version), + unicode_rev(encoding_info->version), + encoding_flags); + return -EINVAL; + } + ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: " + "%s-%u.%u.%u with flags 0x%hx", encoding_info->name, + unicode_major(encoding_info->version), + unicode_minor(encoding_info->version), + unicode_rev(encoding_info->version), + encoding_flags); + + sb->s_encoding = encoding; + sb->s_encoding_flags = encoding_flags; + + return 0; +} +#else +static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) +{ + return 0; +} +#endif + +static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* Warn if metadata_csum and gdt_csum are both set. */ + if (ext4_has_feature_metadata_csum(sb) && + ext4_has_feature_gdt_csum(sb)) + ext4_warning(sb, "metadata_csum and uninit_bg are " + "redundant flags; please run fsck."); + + /* Check for a known checksum algorithm */ + if (!ext4_verify_csum_type(sb, es)) { + ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " + "unknown checksum algorithm."); + return -EINVAL; + } + ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE, + ext4_orphan_file_block_trigger); + + /* Check superblock checksum */ + if (!ext4_superblock_csum_verify(sb, es)) { + ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " + "invalid superblock checksum. 
Run e2fsck?"); + return -EFSBADCRC; + } + + /* Precompute checksum seed for all metadata */ + if (ext4_has_feature_csum_seed(sb)) + sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); + else if (ext4_has_feature_metadata_csum(sb) || + ext4_has_feature_ea_inode(sb)) + sbi->s_csum_seed = ext4_chksum(~0, es->s_uuid, + sizeof(es->s_uuid)); + return 0; +} + +static int ext4_check_feature_compatibility(struct super_block *sb, + struct ext4_super_block *es, + int silent) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && + (ext4_has_compat_features(sb) || + ext4_has_ro_compat_features(sb) || + ext4_has_incompat_features(sb))) + ext4_msg(sb, KERN_WARNING, + "feature flags set on rev 0 fs, " + "running e2fsck is recommended"); + + if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { + set_opt2(sb, HURD_COMPAT); + if (ext4_has_feature_64bit(sb)) { + ext4_msg(sb, KERN_ERR, + "The Hurd can't support 64-bit file systems"); + return -EINVAL; + } + + /* + * ea_inode feature uses l_i_version field which is not + * available in HURD_COMPAT mode. + */ + if (ext4_has_feature_ea_inode(sb)) { + ext4_msg(sb, KERN_ERR, + "ea_inode feature is not supported for Hurd"); + return -EINVAL; + } + } + + if (IS_EXT2_SB(sb)) { + if (ext2_feature_set_ok(sb)) + ext4_msg(sb, KERN_INFO, "mounting ext2 file system " + "using the ext4 subsystem"); + else { + /* + * If we're probing be silent, if this looks like + * it's actually an ext[34] filesystem. + */ + if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) + return -EINVAL; + ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " + "to feature incompatibilities"); + return -EINVAL; + } + } + + if (IS_EXT3_SB(sb)) { + if (ext3_feature_set_ok(sb)) + ext4_msg(sb, KERN_INFO, "mounting ext3 file system " + "using the ext4 subsystem"); + else { + /* + * If we're probing be silent, if this looks like + * it's actually an ext4 filesystem. + */ + if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) + return -EINVAL; + ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " + "to feature incompatibilities"); + return -EINVAL; + } + } + + /* + * Check feature flags regardless of the revision level, since we + * previously didn't change the revision level when setting the flags, + * so there is a chance incompat flags are set on a rev 0 filesystem. 
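+ * (Hence the advisory e2fsck warning at the top of this function; the + * hard failure here is reserved for genuinely unsupported flags.)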
+ */ + if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) + return -EINVAL; + + if (sbi->s_daxdev) { + if (sb->s_blocksize == PAGE_SIZE) + set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); + else + ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); + } + + if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { + if (ext4_has_feature_inline_data(sb)) { + ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" + " that may contain inline data"); + return -EINVAL; + } + if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { + ext4_msg(sb, KERN_ERR, + "DAX unsupported by block device."); + return -EINVAL; + } + } + + if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { + ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", + es->s_encryption_level); + return -EINVAL; + } + + return 0; +} + +static int ext4_check_geometry(struct super_block *sb, + struct ext4_super_block *es) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + __u64 blocks_count; + int err; + + if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) { + ext4_msg(sb, KERN_ERR, + "Number of reserved GDT blocks insanely large: %d", + le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); + return -EINVAL; + } + /* + * Test whether we have more sectors than will fit in sector_t, + * and whether the max offset is addressable by the page cache. + */ + err = generic_check_addressable(sb->s_blocksize_bits, + ext4_blocks_count(es)); + if (err) { + ext4_msg(sb, KERN_ERR, "filesystem" + " too large to mount safely on this system"); + return err; + } + + /* check blocks count against device size */ + blocks_count = sb_bdev_nr_blocks(sb); + if (blocks_count && ext4_blocks_count(es) > blocks_count) { + ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " + "exceeds size of device (%llu blocks)", + ext4_blocks_count(es), blocks_count); + return -EINVAL; + } + + /* + * It makes no sense for the first data block to be beyond the end + * of the filesystem. 
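+ * (Worked example for the group-count math below: a 100 GiB device with + * 4 KiB blocks has 26214400 blocks; with s_first_data_block == 0 and + * 32768 blocks per group the round-up gives exactly 800 groups.)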
+ */ + if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { + ext4_msg(sb, KERN_WARNING, "bad geometry: first data " + "block %u is beyond end of filesystem (%llu)", + le32_to_cpu(es->s_first_data_block), + ext4_blocks_count(es)); + return -EINVAL; + } + if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) && + (sbi->s_cluster_ratio == 1)) { + ext4_msg(sb, KERN_WARNING, "bad geometry: first data " + "block is 0 with a 1k block and cluster size"); + return -EINVAL; + } + + blocks_count = (ext4_blocks_count(es) - + le32_to_cpu(es->s_first_data_block) + + EXT4_BLOCKS_PER_GROUP(sb) - 1); + do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); + if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { + ext4_msg(sb, KERN_WARNING, "groups count too large: %llu " + "(block count %llu, first data block %u, " + "blocks per group %lu)", blocks_count, + ext4_blocks_count(es), + le32_to_cpu(es->s_first_data_block), + EXT4_BLOCKS_PER_GROUP(sb)); + return -EINVAL; + } + sbi->s_groups_count = blocks_count; + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); + if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != + le32_to_cpu(es->s_inodes_count)) { + ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu", + le32_to_cpu(es->s_inodes_count), + ((u64)sbi->s_groups_count * sbi->s_inodes_per_group)); + return -EINVAL; + } + + return 0; +} + +static int ext4_group_desc_init(struct super_block *sb, + struct ext4_super_block *es, + ext4_fsblk_t logical_sb_block, + ext4_group_t *first_not_zeroed) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned int db_count; + ext4_fsblk_t block; + int i; + + db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / + EXT4_DESC_PER_BLOCK(sb); + if (ext4_has_feature_meta_bg(sb)) { + if (le32_to_cpu(es->s_first_meta_bg) > db_count) { + ext4_msg(sb, KERN_WARNING, + "first meta block group too large: %u " + "(group descriptor block count %u)", + le32_to_cpu(es->s_first_meta_bg), db_count); + return -EINVAL; + } + } + rcu_assign_pointer(sbi->s_group_desc, + kvmalloc_array(db_count, + sizeof(struct buffer_head *), + GFP_KERNEL)); + if (sbi->s_group_desc == NULL) { + ext4_msg(sb, KERN_ERR, "not enough memory"); + return -ENOMEM; + } + + bgl_lock_init(sbi->s_blockgroup_lock); + + /* Pre-read the descriptors into the buffer cache */ + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logical_sb_block, i); + ext4_sb_breadahead_unmovable(sb, block); + } + + for (i = 0; i < db_count; i++) { + struct buffer_head *bh; + + block = descriptor_loc(sb, logical_sb_block, i); + bh = ext4_sb_bread_unmovable(sb, block); + if (IS_ERR(bh)) { + ext4_msg(sb, KERN_ERR, + "can't read group descriptor %d", i); + sbi->s_gdb_count = i; + return PTR_ERR(bh); + } + rcu_read_lock(); + rcu_dereference(sbi->s_group_desc)[i] = bh; + rcu_read_unlock(); + } + sbi->s_gdb_count = db_count; + if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) { + ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); + return -EFSCORRUPTED; + } + + return 0; +} + +static int ext4_load_and_init_journal(struct super_block *sb, + struct ext4_super_block *es, + struct ext4_fs_context *ctx) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err; + + err = ext4_load_journal(sb, es, ctx->journal_devnum); + if (err) + return err; + + if (ext4_has_feature_64bit(sb) && + !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_64BIT)) { + ext4_msg(sb, KERN_ERR, 
"Failed to set 64-bit journal feature"); + goto out; + } + + if (!set_journal_csum_feature_set(sb)) { + ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " + "feature set"); + goto out; + } + + if (test_opt2(sb, JOURNAL_FAST_COMMIT) && + !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { + ext4_msg(sb, KERN_ERR, + "Failed to set fast commit journal feature"); + goto out; + } + + /* We have now updated the journal if required, so we can + * validate the data journaling mode. */ + switch (test_opt(sb, DATA_FLAGS)) { + case 0: + /* No mode set, assume a default based on the journal + * capabilities: ORDERED_DATA if the journal can + * cope, else JOURNAL_DATA + */ + if (jbd2_journal_check_available_features + (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { + set_opt(sb, ORDERED_DATA); + sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA; + } else { + set_opt(sb, JOURNAL_DATA); + sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; + } + break; + + case EXT4_MOUNT_ORDERED_DATA: + case EXT4_MOUNT_WRITEBACK_DATA: + if (!jbd2_journal_check_available_features + (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { + ext4_msg(sb, KERN_ERR, "Journal does not support " + "requested data journaling mode"); + goto out; + } + break; + default: + break; + } + + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && + test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "journal_async_commit in data=ordered mode"); + goto out; + } + + set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); + + sbi->s_journal->j_submit_inode_data_buffers = + ext4_journal_submit_inode_data_buffers; + sbi->s_journal->j_finish_inode_data_buffers = + ext4_journal_finish_inode_data_buffers; + + return 0; + +out: + ext4_journal_destroy(sbi, sbi->s_journal); + return -EINVAL; +} + +static int ext4_check_journal_data_mode(struct super_block *sb) +{ + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with " + "data=journal disables delayed allocation, " + "dioread_nolock, O_DIRECT and fast_commit support!\n"); + /* can't mount with both data=journal and dioread_nolock. 
*/ + clear_opt(sb, DIOREAD_NOLOCK); + clear_opt2(sb, JOURNAL_FAST_COMMIT); + if (test_opt2(sb, EXPLICIT_DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + return -EINVAL; + } + if (test_opt(sb, DAX_ALWAYS)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dax"); + return -EINVAL; + } + if (ext4_has_feature_encrypt(sb)) { + ext4_msg(sb, KERN_WARNING, + "encrypted files will use data=ordered " + "instead of data journaling mode"); + } + if (test_opt(sb, DELALLOC)) + clear_opt(sb, DELALLOC); + } else { + sb->s_iflags |= SB_I_CGROUPWB; + } + + return 0; +} + +static const char *ext4_has_journal_option(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) + return "journal_async_commit"; + if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) + return "journal_checksum"; + if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) + return "commit="; + if (EXT4_MOUNT_DATA_FLAGS & + (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) + return "data="; + if (test_opt(sb, DATA_ERR_ABORT)) + return "data_err=abort"; + return NULL; +} + +static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, + int silent) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es; + ext4_fsblk_t logical_sb_block; + unsigned long offset = 0; + struct buffer_head *bh; + int ret = -EINVAL; + int blocksize; + + blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); + if (!blocksize) { + ext4_msg(sb, KERN_ERR, "unable to set blocksize"); + return -EINVAL; + } + + /* + * The ext4 superblock will not be buffer aligned for other than 1kB + * block sizes. We need to calculate the offset from buffer start. + */ + if (blocksize != EXT4_MIN_BLOCK_SIZE) { + logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; + offset = do_div(logical_sb_block, blocksize); + } else { + logical_sb_block = sbi->s_sb_block; + } + + bh = ext4_sb_bread_unmovable(sb, logical_sb_block); + if (IS_ERR(bh)) { + ext4_msg(sb, KERN_ERR, "unable to read superblock"); + return PTR_ERR(bh); + } + /* + * Note: s_es must be initialized as soon as possible because + * some ext4 macro-instructions depend on its value + */ + es = (struct ext4_super_block *) (bh->b_data + offset); + sbi->s_es = es; + sb->s_magic = le16_to_cpu(es->s_magic); + if (sb->s_magic != EXT4_SUPER_MAGIC) { + if (!silent) + ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); + goto out; + } + + if (le32_to_cpu(es->s_log_block_size) > + (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Invalid log block size: %u", + le32_to_cpu(es->s_log_block_size)); + goto out; + } + if (le32_to_cpu(es->s_log_cluster_size) > + (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Invalid log cluster size: %u", + le32_to_cpu(es->s_log_cluster_size)); + goto out; + } + + blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + + /* + * If the default block size is not the same as the real block size, + * we need to reload it. + */ + if (sb->s_blocksize == blocksize) { + *lsb = logical_sb_block; + sbi->s_sbh = bh; + return 0; + } + + /* + * bh must be released before kill_bdev(), otherwise + * it won't be freed and its page also. kill_bdev() + * is called by sb_set_blocksize(). 
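+ * (Releasing bh first lets the page it pins be invalidated; the + * superblock is then re-read below at its recomputed offset.)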
+ */ + brelse(bh); + /* Validate the filesystem blocksize */ + if (!sb_set_blocksize(sb, blocksize)) { + ext4_msg(sb, KERN_ERR, "bad block size %d", + blocksize); + bh = NULL; + goto out; + } + + logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; + offset = do_div(logical_sb_block, blocksize); + bh = ext4_sb_bread_unmovable(sb, logical_sb_block); + if (IS_ERR(bh)) { + ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try"); + ret = PTR_ERR(bh); + bh = NULL; + goto out; + } + es = (struct ext4_super_block *)(bh->b_data + offset); + sbi->s_es = es; + if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { + ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!"); + goto out; + } + *lsb = logical_sb_block; + sbi->s_sbh = bh; + return 0; +out: + brelse(bh); + return ret; +} + +static int ext4_hash_info_init(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + unsigned int i; + + sbi->s_def_hash_version = es->s_def_hash_version; + + if (sbi->s_def_hash_version > DX_HASH_LAST) { + ext4_msg(sb, KERN_ERR, + "Invalid default hash set in the superblock"); + return -EINVAL; + } else if (sbi->s_def_hash_version == DX_HASH_SIPHASH) { + ext4_msg(sb, KERN_ERR, + "SIPHASH is not a valid default hash value"); + return -EINVAL; + } + + for (i = 0; i < 4; i++) + sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); + + if (ext4_has_feature_dir_index(sb)) { + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +#ifdef __CHAR_UNSIGNED__ + if (!sb_rdonly(sb)) + es->s_flags |= + cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; +#else + if (!sb_rdonly(sb)) + es->s_flags |= + cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +#endif + } + } + return 0; +} + +static int ext4_block_group_meta_init(struct super_block *sb, int silent) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int has_huge_files; + + has_huge_files = ext4_has_feature_huge_file(sb); + sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, + has_huge_files); + sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); + + sbi->s_desc_size = le16_to_cpu(es->s_desc_size); + if (ext4_has_feature_64bit(sb)) { + if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || + sbi->s_desc_size > EXT4_MAX_DESC_SIZE || + !is_power_of_2(sbi->s_desc_size)) { + ext4_msg(sb, KERN_ERR, + "unsupported descriptor size %lu", + sbi->s_desc_size); + return -EINVAL; + } + } else + sbi->s_desc_size = EXT4_MIN_DESC_SIZE; + + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); + sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); + + sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb); + if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) { + if (!silent) + ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); + return -EINVAL; + } + if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || + sbi->s_inodes_per_group > sb->s_blocksize * 8) { + ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", + sbi->s_inodes_per_group); + return -EINVAL; + } + sbi->s_itb_per_group = sbi->s_inodes_per_group / + sbi->s_inodes_per_block; + sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb); + sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; + sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); + + return 0; +} + +/* + * It's 
hard to get stripe-aligned blocks if the stripe is not aligned with the + * cluster, so just disable the stripe and alert the user, to simplify the + * code and avoid stripe-aligned allocation which will rarely succeed. + */ +static bool ext4_is_stripe_incompatible(struct super_block *sb, unsigned long stripe) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + return (stripe > 0 && sbi->s_cluster_ratio > 1 && + stripe % sbi->s_cluster_ratio != 0); +} + +static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) +{ + struct ext4_super_block *es = NULL; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t logical_sb_block; + struct inode *root; + int needs_recovery; + int err; + ext4_group_t first_not_zeroed; + struct ext4_fs_context *ctx = fc->fs_private; + int silent = fc->sb_flags & SB_SILENT; + + /* Set defaults for the variables that will be set during parsing */ + if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) + ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; + + sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; + sbi->s_sectors_written_start = + part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); + + err = ext4_load_super(sb, &logical_sb_block, silent); + if (err) + goto out_fail; + + es = sbi->s_es; + sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); + + err = ext4_init_metadata_csum(sb, es); + if (err) + goto failed_mount; + + ext4_set_def_opts(sb, es); + + sbi->s_resuid = make_kuid(&init_user_ns, ext4_get_resuid(es)); + sbi->s_resgid = make_kgid(&init_user_ns, ext4_get_resgid(es)); + sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; + sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; + sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_sb_update_kb = EXT4_DEF_SB_UPDATE_INTERVAL_KB; + sbi->s_sb_update_sec = EXT4_DEF_SB_UPDATE_INTERVAL_SEC; + + /* + * Set the default s_li_wait_mult for lazyinit, for the case where + * no mount option is specified. + */ + sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; + + err = ext4_inode_info_init(sb, es); + if (err) + goto failed_mount; + + err = parse_apply_sb_mount_options(sb, ctx); + if (err < 0) + goto failed_mount; + + sbi->s_def_mount_opt = sbi->s_mount_opt; + sbi->s_def_mount_opt2 = sbi->s_mount_opt2; + + err = ext4_check_opt_consistency(fc, sb); + if (err < 0) + goto failed_mount; + + ext4_apply_options(fc, sb); + + err = ext4_encoding_init(sb, es); + if (err) + goto failed_mount; + + err = ext4_check_journal_data_mode(sb); + if (err) + goto failed_mount; + + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); + + /* HSM events are allowed by default.
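+ * (Assumed background: SB_I_ALLOW_HSM gates hierarchical storage + * management hooks such as fanotify pre-content events.)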
*/ + sb->s_iflags |= SB_I_ALLOW_HSM; + + err = ext4_check_feature_compatibility(sb, es, silent); + if (err) + goto failed_mount; + + err = ext4_block_group_meta_init(sb, silent); + if (err) + goto failed_mount; + + err = ext4_hash_info_init(sb); + if (err) + goto failed_mount; + + err = ext4_handle_clustersize(sb); + if (err) + goto failed_mount; + + err = ext4_check_geometry(sb, es); + if (err) + goto failed_mount; + + timer_setup(&sbi->s_err_report, print_daily_error_info, 0); + spin_lock_init(&sbi->s_error_lock); + INIT_WORK(&sbi->s_sb_upd_work, update_super_work); + + err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); + if (err) + goto failed_mount3; + + err = ext4_es_register_shrinker(sbi); + if (err) + goto failed_mount3; + + sbi->s_stripe = ext4_get_stripe_size(sbi); + if (ext4_is_stripe_incompatible(sb, sbi->s_stripe)) { + ext4_msg(sb, KERN_WARNING, + "stripe (%lu) is not aligned with cluster size (%u), " + "stripe is disabled", + sbi->s_stripe, sbi->s_cluster_ratio); + sbi->s_stripe = 0; + } + sbi->s_extent_max_zeroout_kb = 32; + + /* + * set up enough so that it can read an inode + */ + sb->s_op = &ext4_sops; + sb->s_export_op = &ext4_export_ops; + sb->s_xattr = ext4_xattr_handlers; +#ifdef CONFIG_FS_ENCRYPTION + sb->s_cop = &ext4_cryptops; +#endif +#ifdef CONFIG_FS_VERITY + sb->s_vop = &ext4_verityops; +#endif +#ifdef CONFIG_QUOTA + sb->dq_op = &ext4_quota_operations; + if (ext4_has_feature_quota(sb)) + sb->s_qcop = &dquot_quotactl_sysfile_ops; + else + sb->s_qcop = &ext4_qctl_operations; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; +#endif + super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid)); + super_set_sysfs_name_bdev(sb); + + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + mutex_init(&sbi->s_orphan_lock); + + spin_lock_init(&sbi->s_bdev_wb_lock); + + ext4_atomic_write_init(sb); + ext4_fast_commit_init(sb); + + sb->s_root = NULL; + + needs_recovery = (es->s_last_orphan != 0 || + ext4_has_feature_orphan_present(sb) || + ext4_has_feature_journal_needs_recovery(sb)); + + if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) { + err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)); + if (err) + goto failed_mount3a; + } + + err = -EINVAL; + /* + * The first inode we look at is the journal inode. Don't try + * root first: it may be modified in the journal! 
+ */ + if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { + err = ext4_load_and_init_journal(sb, es, ctx); + if (err) + goto failed_mount3a; + if (bdev_read_only(sb->s_bdev)) + needs_recovery = 0; + } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && + ext4_has_feature_journal_needs_recovery(sb)) { + ext4_msg(sb, KERN_ERR, "required journal recovery " + "suppressed and not mounted read-only"); + goto failed_mount3a; + } else { + const char *journal_option; + + /* Nojournal mode, all journal mount options are illegal */ + journal_option = ext4_has_journal_option(sb); + if (journal_option != NULL) { + ext4_msg(sb, KERN_ERR, + "can't mount with %s, fs mounted w/o journal", + journal_option); + goto failed_mount3a; + } + + sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; + clear_opt(sb, JOURNAL_CHECKSUM); + clear_opt(sb, DATA_FLAGS); + clear_opt2(sb, JOURNAL_FAST_COMMIT); + sbi->s_journal = NULL; + needs_recovery = 0; + } + + if (!test_opt(sb, NO_MBCACHE)) { + sbi->s_ea_block_cache = ext4_xattr_create_cache(); + if (!sbi->s_ea_block_cache) { + ext4_msg(sb, KERN_ERR, + "Failed to create ea_block_cache"); + err = -EINVAL; + goto failed_mount_wq; + } + + if (ext4_has_feature_ea_inode(sb)) { + sbi->s_ea_inode_cache = ext4_xattr_create_cache(); + if (!sbi->s_ea_inode_cache) { + ext4_msg(sb, KERN_ERR, + "Failed to create ea_inode_cache"); + err = -EINVAL; + goto failed_mount_wq; + } + } + } + + /* + * Get the # of file system overhead blocks from the + * superblock if present. + */ + sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); + /* ignore the precalculated value if it is ridiculous */ + if (sbi->s_overhead > ext4_blocks_count(es)) + sbi->s_overhead = 0; + /* + * If the bigalloc feature is not enabled recalculating the + * overhead doesn't take long, so we might as well just redo + * it to make sure we are using the correct value. + */ + if (!ext4_has_feature_bigalloc(sb)) + sbi->s_overhead = 0; + if (sbi->s_overhead == 0) { + err = ext4_calculate_overhead(sb); + if (err) + goto failed_mount_wq; + } + + /* + * The maximum number of concurrent works can be high and + * concurrency isn't really necessary. Limit it to 1. + */ + EXT4_SB(sb)->rsv_conversion_wq = + alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); + if (!EXT4_SB(sb)->rsv_conversion_wq) { + printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); + err = -ENOMEM; + goto failed_mount4; + } + + /* + * The jbd2_journal_load will have done any necessary log recovery, + * so we can safely mount the rest of the filesystem now. 
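+ * (From here on the code reads the root inode, so any journal replay + * must already be complete.)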
+ */ + + root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL); + if (IS_ERR(root)) { + ext4_msg(sb, KERN_ERR, "get root inode failed"); + err = PTR_ERR(root); + root = NULL; + goto failed_mount4; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); + iput(root); + err = -EFSCORRUPTED; + goto failed_mount4; + } + + generic_set_sb_d_ops(sb); + sb->s_root = d_make_root(root); + if (!sb->s_root) { + ext4_msg(sb, KERN_ERR, "get root dentry failed"); + err = -ENOMEM; + goto failed_mount4; + } + + err = ext4_setup_super(sb, es, sb_rdonly(sb)); + if (err == -EROFS) { + sb->s_flags |= SB_RDONLY; + } else if (err) + goto failed_mount4a; + + ext4_set_resv_clusters(sb); + + if (test_opt(sb, BLOCK_VALIDITY)) { + err = ext4_setup_system_zone(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize system " + "zone (%d)", err); + goto failed_mount4a; + } + } + ext4_fc_replay_cleanup(sb); + + ext4_ext_init(sb); + + /* + * Enable optimize_scan if number of groups is > threshold. This can be + * turned off by passing "mb_optimize_scan=0". This can also be + * turned on forcefully by passing "mb_optimize_scan=1". + */ + if (!(ctx->spec & EXT4_SPEC_mb_optimize_scan)) { + if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD) + set_opt2(sb, MB_OPTIMIZE_SCAN); + else + clear_opt2(sb, MB_OPTIMIZE_SCAN); + } + + err = ext4_mb_init(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", + err); + goto failed_mount5; + } + + /* + * We can only set up the journal commit callback once + * mballoc is initialized + */ + if (sbi->s_journal) + sbi->s_journal->j_commit_callback = + ext4_journal_commit_callback; + + err = ext4_percpu_param_init(sbi); + if (err) + goto failed_mount6; + + if (ext4_has_feature_flex_bg(sb)) + if (!ext4_fill_flex_info(sb)) { + ext4_msg(sb, KERN_ERR, + "unable to initialize " + "flex_bg meta info!"); + err = -ENOMEM; + goto failed_mount6; + } + + err = ext4_register_li_request(sb, first_not_zeroed); + if (err) + goto failed_mount6; + + err = ext4_init_orphan_info(sb); + if (err) + goto failed_mount7; +#ifdef CONFIG_QUOTA + /* Enable quota usage during mount. */ + if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { + err = ext4_enable_quotas(sb); + if (err) + goto failed_mount8; + } +#endif /* CONFIG_QUOTA */ + + /* + * Save the original bdev mapping's wb_err value which could be + * used to detect the metadata async write error. + */ + errseq_check_and_advance(&sb->s_bdev->bd_mapping->wb_err, + &sbi->s_bdev_wb_err); + EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; + ext4_orphan_cleanup(sb, es); + EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; + /* + * Update the checksum after updating free space/inode counters and + * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect + * checksum in the buffer cache until it is written out and + * e2fsprogs programs trying to open a file system immediately + * after it is mounted can fail. + */ + ext4_superblock_csum_set(sb); + if (needs_recovery) { + ext4_msg(sb, KERN_INFO, "recovery complete"); + err = ext4_mark_recovery_complete(sb, es); + if (err) + goto failed_mount9; + } + + if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) { + ext4_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but the device does not support discard"); + clear_opt(sb, DISCARD); + } + + if (es->s_error_count) + mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ + + /* Enable message ratelimiting. 
Default is 10 messages per 5 secs. */ + ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); + ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10); + ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); + atomic_set(&sbi->s_warning_count, 0); + atomic_set(&sbi->s_msg_count, 0); + + /* Register sysfs after all initializations are complete. */ + err = ext4_register_sysfs(sb); + if (err) + goto failed_mount9; + + return 0; + +failed_mount9: + ext4_quotas_off(sb, EXT4_MAXQUOTAS); +failed_mount8: __maybe_unused + ext4_release_orphan_info(sb); +failed_mount7: + ext4_unregister_li_request(sb); +failed_mount6: + ext4_mb_release(sb); + ext4_flex_groups_free(sbi); + ext4_percpu_param_destroy(sbi); +failed_mount5: + ext4_ext_release(sb); + ext4_release_system_zone(sb); +failed_mount4a: + dput(sb->s_root); + sb->s_root = NULL; +failed_mount4: + ext4_msg(sb, KERN_ERR, "mount failed"); + if (EXT4_SB(sb)->rsv_conversion_wq) + destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); +failed_mount_wq: + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); + sbi->s_ea_inode_cache = NULL; + + ext4_xattr_destroy_cache(sbi->s_ea_block_cache); + sbi->s_ea_block_cache = NULL; + + if (sbi->s_journal) { + ext4_journal_destroy(sbi, sbi->s_journal); + } +failed_mount3a: + ext4_es_unregister_shrinker(sbi); +failed_mount3: + /* flush s_sb_upd_work before sbi destroy */ + flush_work(&sbi->s_sb_upd_work); + ext4_stop_mmpd(sbi); + timer_delete_sync(&sbi->s_err_report); + ext4_group_desc_free(sbi); +failed_mount: +#if IS_ENABLED(CONFIG_UNICODE) + utf8_unload(sb->s_encoding); +#endif + +#ifdef CONFIG_QUOTA + for (unsigned int i = 0; i < EXT4_MAXQUOTAS; i++) + kfree(get_qf_name(sb, sbi, i)); +#endif + fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); + brelse(sbi->s_sbh); + if (sbi->s_journal_bdev_file) { + invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); + bdev_fput(sbi->s_journal_bdev_file); + } +out_fail: + invalidate_bdev(sb->s_bdev); + sb->s_fs_info = NULL; + return err; +} + +static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) +{ + struct ext4_fs_context *ctx = fc->fs_private; + struct ext4_sb_info *sbi; + const char *descr; + int ret; + + sbi = ext4_alloc_sbi(sb); + if (!sbi) + return -ENOMEM; + + fc->s_fs_info = sbi; + + /* Cleanup superblock name */ + strreplace(sb->s_id, '/', '!'); + + sbi->s_sb_block = 1; /* Default super block location */ + if (ctx->spec & EXT4_SPEC_s_sb_block) + sbi->s_sb_block = ctx->s_sb_block; + + ret = __ext4_fill_super(fc, sb); + if (ret < 0) + goto free_sbi; + + if (sbi->s_journal) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + descr = " journalled data mode"; + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + descr = " ordered data mode"; + else + descr = " writeback data mode"; + } else + descr = "out journal"; + + if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) + ext4_msg(sb, KERN_INFO, "mounted filesystem %pU %s with%s. " + "Quota mode: %s.", &sb->s_uuid, + sb_rdonly(sb) ? "ro" : "r/w", descr, + ext4_quota_mode(sb)); + + /* Update the s_overhead_clusters if necessary */ + ext4_update_overhead(sb, false); + return 0; + +free_sbi: + ext4_free_sbi(sbi); + fc->s_fs_info = NULL; + return ret; +} + +static int ext4_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, ext4_fill_super); +} + +/* + * Setup any per-fs journal parameters now. 
We'll do this both on + * initial mount, once the journal has been initialised but before we've + * done any recovery; and again on any subsequent remount. + */ +static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + journal->j_commit_interval = sbi->s_commit_interval; + journal->j_min_batch_time = sbi->s_min_batch_time; + journal->j_max_batch_time = sbi->s_max_batch_time; + ext4_fc_init(sb, journal); + + write_lock(&journal->j_state_lock); + if (test_opt(sb, BARRIER)) + journal->j_flags |= JBD2_BARRIER; + else + journal->j_flags &= ~JBD2_BARRIER; + /* + * Always enable journal cycle record option, letting the journal + * records log transactions continuously between each mount. + */ + journal->j_flags |= JBD2_CYCLE_RECORD; + write_unlock(&journal->j_state_lock); +} + +static struct inode *ext4_get_journal_inode(struct super_block *sb, + unsigned int journal_inum) +{ + struct inode *journal_inode; + + /* + * Test for the existence of a valid inode on disk. Bad things + * happen if we iget() an unused inode, as the subsequent iput() + * will try to delete it. + */ + journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL); + if (IS_ERR(journal_inode)) { + ext4_msg(sb, KERN_ERR, "no journal found"); + return ERR_CAST(journal_inode); + } + if (!journal_inode->i_nlink) { + make_bad_inode(journal_inode); + iput(journal_inode); + ext4_msg(sb, KERN_ERR, "journal inode is deleted"); + return ERR_PTR(-EFSCORRUPTED); + } + if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { + ext4_msg(sb, KERN_ERR, "invalid journal inode"); + iput(journal_inode); + return ERR_PTR(-EFSCORRUPTED); + } + + ext4_debug("Journal inode found at %p: %lld bytes\n", + journal_inode, journal_inode->i_size); + return journal_inode; +} + +static int ext4_journal_bmap(journal_t *journal, sector_t *block) +{ + struct ext4_map_blocks map; + int ret; + + if (journal->j_inode == NULL) + return 0; + + map.m_lblk = *block; + map.m_len = 1; + ret = ext4_map_blocks(NULL, journal->j_inode, &map, 0); + if (ret <= 0) { + ext4_msg(journal->j_inode->i_sb, KERN_CRIT, + "journal bmap failed: block %llu ret %d\n", + *block, ret); + jbd2_journal_abort(journal, ret ? 
ret : -EIO); + return ret; + } + *block = map.m_pblk; + return 0; +} + +static journal_t *ext4_open_inode_journal(struct super_block *sb, + unsigned int journal_inum) +{ + struct inode *journal_inode; + journal_t *journal; + + journal_inode = ext4_get_journal_inode(sb, journal_inum); + if (IS_ERR(journal_inode)) + return ERR_CAST(journal_inode); + + journal = jbd2_journal_init_inode(journal_inode); + if (IS_ERR(journal)) { + ext4_msg(sb, KERN_ERR, "Could not load journal inode"); + iput(journal_inode); + return ERR_CAST(journal); + } + journal->j_private = sb; + journal->j_bmap = ext4_journal_bmap; + ext4_init_journal_params(sb, journal); + return journal; +} + +static struct file *ext4_get_journal_blkdev(struct super_block *sb, + dev_t j_dev, ext4_fsblk_t *j_start, + ext4_fsblk_t *j_len) +{ + struct buffer_head *bh; + struct block_device *bdev; + struct file *bdev_file; + int hblock, blocksize; + ext4_fsblk_t sb_block; + unsigned long offset; + struct ext4_super_block *es; + int errno; + + bdev_file = bdev_file_open_by_dev(j_dev, + BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, + sb, &fs_holder_ops); + if (IS_ERR(bdev_file)) { + ext4_msg(sb, KERN_ERR, + "failed to open journal device unknown-block(%u,%u) %ld", + MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file)); + return bdev_file; + } + + bdev = file_bdev(bdev_file); + blocksize = sb->s_blocksize; + hblock = bdev_logical_block_size(bdev); + if (blocksize < hblock) { + ext4_msg(sb, KERN_ERR, + "blocksize too small for journal device"); + errno = -EINVAL; + goto out_bdev; + } + + sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; + offset = EXT4_MIN_BLOCK_SIZE % blocksize; + set_blocksize(bdev_file, blocksize); + bh = __bread(bdev, sb_block, blocksize); + if (!bh) { + ext4_msg(sb, KERN_ERR, "couldn't read superblock of " + "external journal"); + errno = -EINVAL; + goto out_bdev; + } + + es = (struct ext4_super_block *) (bh->b_data + offset); + if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || + !(le32_to_cpu(es->s_feature_incompat) & + EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { + ext4_msg(sb, KERN_ERR, "external journal has bad superblock"); + errno = -EFSCORRUPTED; + goto out_bh; + } + + if ((le32_to_cpu(es->s_feature_ro_compat) & + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + es->s_checksum != ext4_superblock_csum(es)) { + ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock"); + errno = -EFSCORRUPTED; + goto out_bh; + } + + if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { + ext4_msg(sb, KERN_ERR, "journal UUID does not match"); + errno = -EFSCORRUPTED; + goto out_bh; + } + + *j_start = sb_block + 1; + *j_len = ext4_blocks_count(es); + brelse(bh); + return bdev_file; + +out_bh: + brelse(bh); +out_bdev: + bdev_fput(bdev_file); + return ERR_PTR(errno); +} + +static journal_t *ext4_open_dev_journal(struct super_block *sb, + dev_t j_dev) +{ + journal_t *journal; + ext4_fsblk_t j_start; + ext4_fsblk_t j_len; + struct file *bdev_file; + int errno = 0; + + bdev_file = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); + if (IS_ERR(bdev_file)) + return ERR_CAST(bdev_file); + + journal = jbd2_journal_init_dev(file_bdev(bdev_file), sb->s_bdev, j_start, + j_len, sb->s_blocksize); + if (IS_ERR(journal)) { + ext4_msg(sb, KERN_ERR, "failed to create device journal"); + errno = PTR_ERR(journal); + goto out_bdev; + } + if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { + ext4_msg(sb, KERN_ERR, "External journal has more than one " + "user (unsupported) - %d", + be32_to_cpu(journal->j_superblock->s_nr_users)); + 
errno = -EINVAL; + goto out_journal; + } + journal->j_private = sb; + EXT4_SB(sb)->s_journal_bdev_file = bdev_file; + ext4_init_journal_params(sb, journal); + return journal; + +out_journal: + ext4_journal_destroy(EXT4_SB(sb), journal); +out_bdev: + bdev_fput(bdev_file); + return ERR_PTR(errno); +} + +static int ext4_load_journal(struct super_block *sb, + struct ext4_super_block *es, + unsigned long journal_devnum) +{ + journal_t *journal; + unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); + dev_t journal_dev; + int err = 0; + int really_read_only; + int journal_dev_ro; + + if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) + return -EFSCORRUPTED; + + if (journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + ext4_msg(sb, KERN_INFO, "external journal device major/minor " + "numbers have changed"); + journal_dev = new_decode_dev(journal_devnum); + } else + journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); + + if (journal_inum && journal_dev) { + ext4_msg(sb, KERN_ERR, + "filesystem has both journal inode and journal device!"); + return -EINVAL; + } + + if (journal_inum) { + journal = ext4_open_inode_journal(sb, journal_inum); + if (IS_ERR(journal)) + return PTR_ERR(journal); + } else { + journal = ext4_open_dev_journal(sb, journal_dev); + if (IS_ERR(journal)) + return PTR_ERR(journal); + } + + journal_dev_ro = bdev_read_only(journal->j_dev); + really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro; + + if (journal_dev_ro && !sb_rdonly(sb)) { + ext4_msg(sb, KERN_ERR, + "journal device read-only, try mounting with '-o ro'"); + err = -EROFS; + goto err_out; + } + + /* + * Are we loading a blank journal or performing recovery after a + * crash? For recovery, we need to check in advance whether we + * can get read-write access to the device. + */ + if (ext4_has_feature_journal_needs_recovery(sb)) { + if (sb_rdonly(sb)) { + ext4_msg(sb, KERN_INFO, "INFO: recovery " + "required on readonly filesystem"); + if (really_read_only) { + ext4_msg(sb, KERN_ERR, "write access " + "unavailable, cannot proceed " + "(try mounting with noload)"); + err = -EROFS; + goto err_out; + } + ext4_msg(sb, KERN_INFO, "write access will " + "be enabled during recovery"); + } + } + + if (!(journal->j_flags & JBD2_BARRIER)) + ext4_msg(sb, KERN_INFO, "barriers disabled"); + + if (!ext4_has_feature_journal_needs_recovery(sb)) + err = jbd2_journal_wipe(journal, !really_read_only); + if (!err) { + char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); + __le16 orig_state; + bool changed = false; + + if (save) + memcpy(save, ((char *) es) + + EXT4_S_ERR_START, EXT4_S_ERR_LEN); + err = jbd2_journal_load(journal); + if (save && memcmp(((char *) es) + EXT4_S_ERR_START, + save, EXT4_S_ERR_LEN)) { + memcpy(((char *) es) + EXT4_S_ERR_START, + save, EXT4_S_ERR_LEN); + changed = true; + } + kfree(save); + orig_state = es->s_state; + es->s_state |= cpu_to_le16(EXT4_SB(sb)->s_mount_state & + EXT4_ERROR_FS); + if (orig_state != es->s_state) + changed = true; + /* Write out restored error information to the superblock */ + if (changed && !really_read_only) { + int err2; + err2 = ext4_commit_super(sb); + err = err ? 
: err2; + } + } + + if (err) { + ext4_msg(sb, KERN_ERR, "error loading journal"); + goto err_out; + } + + EXT4_SB(sb)->s_journal = journal; + err = ext4_clear_journal_err(sb, es); + if (err) { + ext4_journal_destroy(EXT4_SB(sb), journal); + return err; + } + + if (!really_read_only && journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + es->s_journal_dev = cpu_to_le32(journal_devnum); + ext4_commit_super(sb); + } + if (!really_read_only && journal_inum && + journal_inum != le32_to_cpu(es->s_journal_inum)) { + es->s_journal_inum = cpu_to_le32(journal_inum); + ext4_commit_super(sb); + } + + return 0; + +err_out: + ext4_journal_destroy(EXT4_SB(sb), journal); + return err; +} + +/* Copy state of EXT4_SB(sb) into buffer for on-disk superblock */ +static void ext4_update_super(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct buffer_head *sbh = sbi->s_sbh; + + lock_buffer(sbh); + /* + * If the file system is mounted read-only, don't update the + * superblock write time. This avoids updating the superblock + * write time when we are mounting the root file system + * read/only but we need to replay the journal; at that point, + * for people who are east of GMT and who make their clock + * tick in localtime for Windows bug-for-bug compatibility, + * the clock is set in the future, and this will cause e2fsck + * to complain and force a full file system check. + */ + if (!sb_rdonly(sb)) + ext4_update_tstamp(es, s_wtime); + es->s_kbytes_written = + cpu_to_le64(sbi->s_kbytes_written + + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - + sbi->s_sectors_written_start) >> 1)); + if (percpu_counter_initialized(&sbi->s_freeclusters_counter)) + ext4_free_blocks_count_set(es, + EXT4_C2B(sbi, percpu_counter_sum_positive( + &sbi->s_freeclusters_counter))); + if (percpu_counter_initialized(&sbi->s_freeinodes_counter)) + es->s_free_inodes_count = + cpu_to_le32(percpu_counter_sum_positive( + &sbi->s_freeinodes_counter)); + /* Copy error information to the on-disk superblock */ + spin_lock(&sbi->s_error_lock); + if (sbi->s_add_error_count > 0) { + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + if (!es->s_first_error_time && !es->s_first_error_time_hi) { + __ext4_update_tstamp(&es->s_first_error_time, + &es->s_first_error_time_hi, + sbi->s_first_error_time); + strtomem_pad(es->s_first_error_func, + sbi->s_first_error_func, 0); + es->s_first_error_line = + cpu_to_le32(sbi->s_first_error_line); + es->s_first_error_ino = + cpu_to_le32(sbi->s_first_error_ino); + es->s_first_error_block = + cpu_to_le64(sbi->s_first_error_block); + es->s_first_error_errcode = + ext4_errno_to_code(sbi->s_first_error_code); + } + __ext4_update_tstamp(&es->s_last_error_time, + &es->s_last_error_time_hi, + sbi->s_last_error_time); + strtomem_pad(es->s_last_error_func, sbi->s_last_error_func, 0); + es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); + es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); + es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block); + es->s_last_error_errcode = + ext4_errno_to_code(sbi->s_last_error_code); + /* + * Start the daily error reporting function if it hasn't been + * started already + */ + if (!es->s_error_count) + mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); + le32_add_cpu(&es->s_error_count, sbi->s_add_error_count); + sbi->s_add_error_count = 0; + } + spin_unlock(&sbi->s_error_lock); + + ext4_superblock_csum_set(sb); + unlock_buffer(sbh); +} + +static int ext4_commit_super(struct 
super_block *sb) +{ + struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; + + if (!sbh) + return -EINVAL; + + ext4_update_super(sb); + + lock_buffer(sbh); + /* Buffer got discarded which means block device got invalidated */ + if (!buffer_mapped(sbh)) { + unlock_buffer(sbh); + return -EIO; + } + + if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + ext4_msg(sb, KERN_ERR, "previous I/O error to " + "superblock detected"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + get_bh(sbh); + /* Clear potential dirty bit if it was journalled update */ + clear_buffer_dirty(sbh); + sbh->b_end_io = end_buffer_write_sync; + submit_bh(REQ_OP_WRITE | REQ_SYNC | + (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh); + wait_on_buffer(sbh); + if (buffer_write_io_error(sbh)) { + ext4_msg(sb, KERN_ERR, "I/O error while writing " + "superblock"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + return -EIO; + } + return 0; +} + +/* + * Have we just finished recovery? If so, and if we are mounting (or + * remounting) the filesystem readonly, then we will end up with a + * consistent fs on disk. Record that fact. + */ +static int ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block *es) +{ + int err; + journal_t *journal = EXT4_SB(sb)->s_journal; + + if (!ext4_has_feature_journal(sb)) { + if (journal != NULL) { + ext4_error(sb, "Journal got removed while the fs was " + "mounted!"); + return -EFSCORRUPTED; + } + return 0; + } + jbd2_journal_lock_updates(journal); + err = jbd2_journal_flush(journal, 0); + if (err < 0) + goto out; + + if (sb_rdonly(sb) && (ext4_has_feature_journal_needs_recovery(sb) || + ext4_has_feature_orphan_present(sb))) { + if (!ext4_orphan_file_empty(sb)) { + ext4_error(sb, "Orphan file not empty on read-only fs."); + err = -EFSCORRUPTED; + goto out; + } + ext4_clear_feature_journal_needs_recovery(sb); + ext4_clear_feature_orphan_present(sb); + ext4_commit_super(sb); + } +out: + jbd2_journal_unlock_updates(journal); + return err; +} + +/* + * If we are mounting (or read-write remounting) a filesystem whose journal + * has recorded an error from a previous lifetime, move that error to the + * main filesystem now. 
+ */ +static int ext4_clear_journal_err(struct super_block *sb, + struct ext4_super_block *es) +{ + journal_t *journal; + int j_errno; + const char *errstr; + + if (!ext4_has_feature_journal(sb)) { + ext4_error(sb, "Journal got removed while the fs was mounted!"); + return -EFSCORRUPTED; + } + + journal = EXT4_SB(sb)->s_journal; + + /* + * Now check for any error status which may have been recorded in the + * journal by a prior ext4_error() or ext4_abort() + */ + + j_errno = jbd2_journal_errno(journal); + if (j_errno) { + char nbuf[16]; + + errstr = ext4_decode_error(sb, j_errno, nbuf); + ext4_warning(sb, "Filesystem error recorded " + "from previous mount: %s", errstr); + + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + j_errno = ext4_commit_super(sb); + if (j_errno) + return j_errno; + ext4_warning(sb, "Marked fs in need of filesystem check."); + + jbd2_journal_clear_err(journal); + jbd2_journal_update_sb_errno(journal); + } + return 0; +} + +/* + * Force the running and committing transactions to commit, + * and wait on the commit. + */ +int ext4_force_commit(struct super_block *sb) +{ + return ext4_journal_force_commit(EXT4_SB(sb)->s_journal); +} + +static int ext4_sync_fs(struct super_block *sb, int wait) +{ + int ret = 0; + tid_t target; + bool needs_barrier = false; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + ret = ext4_emergency_state(sb); + if (unlikely(ret)) + return ret; + + trace_ext4_sync_fs(sb, wait); + flush_workqueue(sbi->rsv_conversion_wq); + /* + * Writeback quota in non-journalled quota case - journalled quota has + * no dirty dquots + */ + dquot_writeback_dquots(sb, -1); + /* + * Data writeback is possible w/o journal transaction, so barrier must + * being sent at the end of the function. But we can skip it if + * transaction_commit will do it for us. + */ + if (sbi->s_journal) { + target = jbd2_get_latest_transaction(sbi->s_journal); + if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) + needs_barrier = true; + + if (jbd2_journal_start_commit(sbi->s_journal, &target)) { + if (wait) + ret = jbd2_log_wait_commit(sbi->s_journal, + target); + } + } else if (wait && test_opt(sb, BARRIER)) + needs_barrier = true; + if (needs_barrier) { + int err; + err = blkdev_issue_flush(sb->s_bdev); + if (!ret) + ret = err; + } + + return ret; +} + +/* + * LVM calls this function before a (read-only) snapshot is created. This + * gives us a chance to flush the journal completely and mark the fs clean. + * + * Note that only this function cannot bring a filesystem to be in a clean + * state independently. It relies on upper layer to stop all data & metadata + * modifications. + */ +static int ext4_freeze(struct super_block *sb) +{ + int error = 0; + journal_t *journal = EXT4_SB(sb)->s_journal; + + if (journal) { + /* Now we set up the journal barrier. */ + jbd2_journal_lock_updates(journal); + + /* + * Don't clear the needs_recovery flag if we failed to + * flush the journal. + */ + error = jbd2_journal_flush(journal, 0); + if (error < 0) + goto out; + + /* Journal blocked and flushed, clear needs_recovery flag. */ + ext4_clear_feature_journal_needs_recovery(sb); + if (ext4_orphan_file_empty(sb)) + ext4_clear_feature_orphan_present(sb); + } + + error = ext4_commit_super(sb); +out: + if (journal) + /* we rely on upper layer to stop further updates */ + jbd2_journal_unlock_updates(journal); + return error; +} + +/* + * Called by LVM after the snapshot is done. 
We need to reset the RECOVER + * flag here, even though the filesystem is not technically dirty yet. + */ +static int ext4_unfreeze(struct super_block *sb) +{ + if (ext4_emergency_state(sb)) + return 0; + + if (EXT4_SB(sb)->s_journal) { + /* Reset the needs_recovery flag before the fs is unlocked. */ + ext4_set_feature_journal_needs_recovery(sb); + if (ext4_has_feature_orphan_file(sb)) + ext4_set_feature_orphan_present(sb); + } + + ext4_commit_super(sb); + return 0; +} + +/* + * Structure to save mount options for ext4_remount's benefit + */ +struct ext4_mount_options { + unsigned long s_mount_opt; + unsigned long s_mount_opt2; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned long s_commit_interval; + u32 s_min_batch_time, s_max_batch_time; +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[EXT4_MAXQUOTAS]; +#endif +}; + +static int __ext4_remount(struct fs_context *fc, struct super_block *sb) +{ + struct ext4_fs_context *ctx = fc->fs_private; + struct ext4_super_block *es; + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned long old_sb_flags; + struct ext4_mount_options old_opts; + ext4_group_t g; + int err = 0; + int alloc_ctx; +#ifdef CONFIG_QUOTA + int enable_quota = 0; + int i, j; + char *to_free[EXT4_MAXQUOTAS]; +#endif + + + /* Store the original options */ + old_sb_flags = sb->s_flags; + old_opts.s_mount_opt = sbi->s_mount_opt; + old_opts.s_mount_opt2 = sbi->s_mount_opt2; + old_opts.s_resuid = sbi->s_resuid; + old_opts.s_resgid = sbi->s_resgid; + old_opts.s_commit_interval = sbi->s_commit_interval; + old_opts.s_min_batch_time = sbi->s_min_batch_time; + old_opts.s_max_batch_time = sbi->s_max_batch_time; +#ifdef CONFIG_QUOTA + old_opts.s_jquota_fmt = sbi->s_jquota_fmt; + for (i = 0; i < EXT4_MAXQUOTAS; i++) + if (sbi->s_qf_names[i]) { + char *qf_name = get_qf_name(sb, sbi, i); + + old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL); + if (!old_opts.s_qf_names[i]) { + for (j = 0; j < i; j++) + kfree(old_opts.s_qf_names[j]); + return -ENOMEM; + } + } else + old_opts.s_qf_names[i] = NULL; +#endif + if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) { + if (sbi->s_journal && sbi->s_journal->j_task->io_context) + ctx->journal_ioprio = + sbi->s_journal->j_task->io_context->ioprio; + else + ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; + + } + + if ((ctx->spec & EXT4_SPEC_s_stripe) && + ext4_is_stripe_incompatible(sb, ctx->s_stripe)) { + ext4_msg(sb, KERN_WARNING, + "stripe (%lu) is not aligned with cluster size (%u), " + "stripe is disabled", + ctx->s_stripe, sbi->s_cluster_ratio); + ctx->s_stripe = 0; + } + + /* + * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause + * two calls to ext4_should_dioread_nolock() to return inconsistent + * values, triggering WARN_ON in ext4_add_complete_io(). we grab + * here s_writepages_rwsem to avoid race between writepages ops and + * remount. 
+ */ + alloc_ctx = ext4_writepages_down_write(sb); + ext4_apply_options(fc, sb); + ext4_writepages_up_write(sb, alloc_ctx); + + if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ + test_opt(sb, JOURNAL_CHECKSUM)) { + ext4_msg(sb, KERN_ERR, "changing journal_checksum " + "during remount not supported; ignoring"); + sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM; + } + + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + if (test_opt2(sb, EXPLICIT_DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + err = -EINVAL; + goto restore_opts; + } + if (test_opt(sb, DIOREAD_NOLOCK)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dioread_nolock"); + err = -EINVAL; + goto restore_opts; + } + } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) { + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "journal_async_commit in data=ordered mode"); + err = -EINVAL; + goto restore_opts; + } + } + + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) { + ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount"); + err = -EINVAL; + goto restore_opts; + } + + if ((old_opts.s_mount_opt & EXT4_MOUNT_DELALLOC) && + !test_opt(sb, DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't disable delalloc during remount"); + err = -EINVAL; + goto restore_opts; + } + + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); + + es = sbi->s_es; + + if (sbi->s_journal) { + ext4_init_journal_params(sb, sbi->s_journal); + set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); + } + + /* Flush outstanding errors before changing fs state */ + flush_work(&sbi->s_sb_upd_work); + + if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { + if (ext4_emergency_state(sb)) { + err = -EROFS; + goto restore_opts; + } + + if (fc->sb_flags & SB_RDONLY) { + err = sync_filesystem(sb); + if (err < 0) + goto restore_opts; + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + + /* + * First of all, the unconditional stuff we have to do + * to disable replay of the journal when we next remount + */ + sb->s_flags |= SB_RDONLY; + + /* + * OK, test if we are remounting a valid rw partition + * readonly, and if so set the rdonly flag and then + * mark the partition as valid again. + */ + if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) && + (sbi->s_mount_state & EXT4_VALID_FS)) + es->s_state = cpu_to_le16(sbi->s_mount_state); + + if (sbi->s_journal) { + /* + * We let remount-ro finish even if marking fs + * as clean failed... + */ + ext4_mark_recovery_complete(sb, es); + } + } else { + /* Make sure we can mount this feature set readwrite */ + if (ext4_has_feature_readonly(sb) || + !ext4_feature_set_ok(sb, 0)) { + err = -EROFS; + goto restore_opts; + } + /* + * Make sure the group descriptor checksums + * are sane. If they aren't, refuse to remount r/w. + */ + for (g = 0; g < sbi->s_groups_count; g++) { + struct ext4_group_desc *gdp = + ext4_get_group_desc(sb, g, NULL); + + if (!ext4_group_desc_csum_verify(sb, g, gdp)) { + ext4_msg(sb, KERN_ERR, + "ext4_remount: Checksum for group %u failed (%u!=%u)", + g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)), + le16_to_cpu(gdp->bg_checksum)); + err = -EFSBADCRC; + goto restore_opts; + } + } + + /* + * If we have an unprocessed orphan list hanging + * around from a previously readonly bdev mount, + * require a full umount/remount for now. 
+ */ + if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) { + ext4_msg(sb, KERN_WARNING, "Couldn't " + "remount RDWR because of unprocessed " + "orphan inode list. Please " + "umount/remount instead"); + err = -EINVAL; + goto restore_opts; + } + + /* + * Mounting a RDONLY partition read-write, so reread + * and store the current valid flag. (It may have + * been changed by e2fsck since we originally mounted + * the partition.) + */ + if (sbi->s_journal) { + err = ext4_clear_journal_err(sb, es); + if (err) + goto restore_opts; + } + sbi->s_mount_state = (le16_to_cpu(es->s_state) & + ~EXT4_FC_REPLAY); + + err = ext4_setup_super(sb, es, 0); + if (err) + goto restore_opts; + + sb->s_flags &= ~SB_RDONLY; + if (ext4_has_feature_mmp(sb)) { + err = ext4_multi_mount_protect(sb, + le64_to_cpu(es->s_mmp_block)); + if (err) + goto restore_opts; + } +#ifdef CONFIG_QUOTA + enable_quota = 1; +#endif + } + } + + /* + * Handle creation of system zone data early because it can fail. + * Releasing of existing data is done when we are sure remount will + * succeed. + */ + if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) { + err = ext4_setup_system_zone(sb); + if (err) + goto restore_opts; + } + + if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) { + err = ext4_commit_super(sb); + if (err) + goto restore_opts; + } + +#ifdef CONFIG_QUOTA + if (enable_quota) { + if (sb_any_quota_suspended(sb)) + dquot_resume(sb, -1); + else if (ext4_has_feature_quota(sb)) { + err = ext4_enable_quotas(sb); + if (err) + goto restore_opts; + } + } + /* Release old quota file names */ + for (i = 0; i < EXT4_MAXQUOTAS; i++) + kfree(old_opts.s_qf_names[i]); +#endif + if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) + ext4_release_system_zone(sb); + + /* + * Reinitialize lazy itable initialization thread based on + * current settings + */ + if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE)) + ext4_unregister_li_request(sb); + else { + ext4_group_t first_not_zeroed; + first_not_zeroed = ext4_has_uninit_itable(sb); + ext4_register_li_request(sb, first_not_zeroed); + } + + if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) + ext4_stop_mmpd(sbi); + + /* + * Handle aborting the filesystem as the last thing during remount to + * avoid obsure errors during remount when some option changes fail to + * apply due to shutdown filesystem. 
+ */ + if (test_opt2(sb, ABORT)) + ext4_abort(sb, ESHUTDOWN, "Abort forced by user"); + + return 0; + +restore_opts: + /* + * If there was a failing r/w to ro transition, we may need to + * re-enable quota + */ + if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) && + sb_any_quota_suspended(sb)) + dquot_resume(sb, -1); + + alloc_ctx = ext4_writepages_down_write(sb); + sb->s_flags = old_sb_flags; + sbi->s_mount_opt = old_opts.s_mount_opt; + sbi->s_mount_opt2 = old_opts.s_mount_opt2; + sbi->s_resuid = old_opts.s_resuid; + sbi->s_resgid = old_opts.s_resgid; + sbi->s_commit_interval = old_opts.s_commit_interval; + sbi->s_min_batch_time = old_opts.s_min_batch_time; + sbi->s_max_batch_time = old_opts.s_max_batch_time; + ext4_writepages_up_write(sb, alloc_ctx); + + if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) + ext4_release_system_zone(sb); +#ifdef CONFIG_QUOTA + sbi->s_jquota_fmt = old_opts.s_jquota_fmt; + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + to_free[i] = get_qf_name(sb, sbi, i); + rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]); + } + synchronize_rcu(); + for (i = 0; i < EXT4_MAXQUOTAS; i++) + kfree(to_free[i]); +#endif + if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) + ext4_stop_mmpd(sbi); + return err; +} + +static int ext4_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + int ret; + bool old_ro = sb_rdonly(sb); + + fc->s_fs_info = EXT4_SB(sb); + + ret = ext4_check_opt_consistency(fc, sb); + if (ret < 0) + return ret; + + ret = __ext4_remount(fc, sb); + if (ret < 0) + return ret; + + ext4_msg(sb, KERN_INFO, "re-mounted %pU%s.", + &sb->s_uuid, + (old_ro != sb_rdonly(sb)) ? (sb_rdonly(sb) ? " ro" : " r/w") : ""); + + return 0; +} + +#ifdef CONFIG_QUOTA +static int ext4_statfs_project(struct super_block *sb, + kprojid_t projid, struct kstatfs *buf) +{ + struct kqid qid; + struct dquot *dquot; + u64 limit; + u64 curblock; + + qid = make_kqid_projid(projid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) + return PTR_ERR(dquot); + spin_lock(&dquot->dq_dqb_lock); + + limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit, + dquot->dq_dqb.dqb_bhardlimit); + limit >>= sb->s_blocksize_bits; + + if (limit) { + uint64_t remaining = 0; + + curblock = (dquot->dq_dqb.dqb_curspace + + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; + if (limit > curblock) + remaining = limit - curblock; + + buf->f_blocks = min(buf->f_blocks, limit); + buf->f_bfree = min(buf->f_bfree, remaining); + buf->f_bavail = min(buf->f_bavail, remaining); + } + + limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, + dquot->dq_dqb.dqb_ihardlimit); + if (limit) { + uint64_t remaining = 0; + + if (limit > dquot->dq_dqb.dqb_curinodes) + remaining = limit - dquot->dq_dqb.dqb_curinodes; + + buf->f_files = min(buf->f_files, limit); + buf->f_ffree = min(buf->f_ffree, remaining); + } + + spin_unlock(&dquot->dq_dqb_lock); + dqput(dquot); + return 0; +} +#endif + +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_fsblk_t overhead = 0, resv_blocks; + s64 bfree; + resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); + + if (!test_opt(sb, MINIX_DF)) + overhead = sbi->s_overhead; + + buf->f_type = EXT4_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead); + bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - + 
percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); + /* prevent underflow in case that few free space is available */ + buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); + buf->f_bavail = buf->f_bfree - + (ext4_r_blocks_count(es) + resv_blocks); + if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks)) + buf->f_bavail = 0; + buf->f_files = le32_to_cpu(es->s_inodes_count); + buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); + buf->f_namelen = EXT4_NAME_LEN; + buf->f_fsid = uuid_to_fsid(es->s_uuid); + +#ifdef CONFIG_QUOTA + if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && + sb_has_quota_limits_enabled(sb, PRJQUOTA)) + ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf); +#endif + return 0; +} + + +#ifdef CONFIG_QUOTA + +/* + * Helper functions so that transaction is started before we acquire dqio_sem + * to keep correct lock ordering of transaction > dqio_sem + */ +static inline struct inode *dquot_to_inode(struct dquot *dquot) +{ + return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; +} + +static int ext4_write_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + struct inode *inode; + + inode = dquot_to_inode(dquot); + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, + EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit(dquot); + if (ret < 0) + ext4_error_err(dquot->dq_sb, -ret, + "Failed to commit dquot type %d", + dquot->dq_id.type); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext4_acquire_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + + handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, + EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_acquire(dquot); + if (ret < 0) + ext4_error_err(dquot->dq_sb, -ret, + "Failed to acquire dquot type %d", + dquot->dq_id.type); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext4_release_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + bool freeze_protected = false; + + /* + * Trying to sb_start_intwrite() in a running transaction + * can result in a deadlock. Further, running transactions + * are already protected from freezing. 
+ */ + if (!ext4_journal_current_handle()) { + sb_start_intwrite(dquot->dq_sb); + freeze_protected = true; + } + + handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, + EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) { + /* Release dquot anyway to avoid endless cycle in dqput() */ + dquot_release(dquot); + if (freeze_protected) + sb_end_intwrite(dquot->dq_sb); + return PTR_ERR(handle); + } + ret = dquot_release(dquot); + if (ret < 0) + ext4_error_err(dquot->dq_sb, -ret, + "Failed to release dquot type %d", + dquot->dq_id.type); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + + if (freeze_protected) + sb_end_intwrite(dquot->dq_sb); + + return ret; +} + +static int ext4_mark_dquot_dirty(struct dquot *dquot) +{ + struct super_block *sb = dquot->dq_sb; + + if (ext4_is_quota_journalled(sb)) { + dquot_mark_dquot_dirty(dquot); + return ext4_write_dquot(dquot); + } else { + return dquot_mark_dquot_dirty(dquot); + } +} + +static int ext4_write_info(struct super_block *sb, int type) +{ + int ret, err; + handle_t *handle; + + /* Data block + inode block */ + handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit_info(sb, type); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static void lockdep_set_quota_inode(struct inode *inode, int subclass) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + /* The first argument of lockdep_set_subclass has to be + * *exactly* the same as the argument to init_rwsem() --- in + * this case, in init_once() --- or lockdep gets unhappy + * because the name of the lock is set using the + * stringification of the argument to init_rwsem(). + */ + (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */ + lockdep_set_subclass(&ei->i_data_sem, subclass); +} + +/* + * Standard function to be called on quota_on + */ +static int ext4_quota_on(struct super_block *sb, int type, int format_id, + const struct path *path) +{ + int err; + + if (!test_opt(sb, QUOTA)) + return -EINVAL; + + /* Quotafile not on the same filesystem? */ + if (path->dentry->d_sb != sb) + return -EXDEV; + + /* Quota already enabled for this file? */ + if (IS_NOQUOTA(d_inode(path->dentry))) + return -EBUSY; + + /* Journaling quota? */ + if (EXT4_SB(sb)->s_qf_names[type]) { + /* Quotafile not in fs root? */ + if (path->dentry->d_parent != sb->s_root) + ext4_msg(sb, KERN_WARNING, + "Quota file not on filesystem root. " + "Journaled quota will not work"); + sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY; + } else { + /* + * Clear the flag just in case mount options changed since + * last time. + */ + sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY; + } + + lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); + err = dquot_quota_on(sb, type, format_id, path); + if (!err) { + struct inode *inode = d_inode(path->dentry); + handle_t *handle; + + /* + * Set inode flags to prevent userspace from messing with quota + * files. If this fails, we return success anyway since quotas + * are already enabled and this is not a hard failure. 
+ */ + inode_lock(inode); + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); + if (IS_ERR(handle)) + goto unlock_inode; + EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL; + inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, + S_NOATIME | S_IMMUTABLE); + err = ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + unlock_inode: + inode_unlock(inode); + if (err) + dquot_quota_off(sb, type); + } + if (err) + lockdep_set_quota_inode(path->dentry->d_inode, + I_DATA_SEM_NORMAL); + return err; +} + +static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum) +{ + switch (type) { + case USRQUOTA: + return qf_inum == EXT4_USR_QUOTA_INO; + case GRPQUOTA: + return qf_inum == EXT4_GRP_QUOTA_INO; + case PRJQUOTA: + return qf_inum >= EXT4_GOOD_OLD_FIRST_INO; + default: + BUG(); + } +} + +static int ext4_quota_enable(struct super_block *sb, int type, int format_id, + unsigned int flags) +{ + int err; + struct inode *qf_inode; + unsigned long qf_inums[EXT4_MAXQUOTAS] = { + le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) + }; + + BUG_ON(!ext4_has_feature_quota(sb)); + + if (!qf_inums[type]) + return -EPERM; + + if (!ext4_check_quota_inum(type, qf_inums[type])) { + ext4_error(sb, "Bad quota inum: %lu, type: %d", + qf_inums[type], type); + return -EUCLEAN; + } + + qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL); + if (IS_ERR(qf_inode)) { + ext4_error(sb, "Bad quota inode: %lu, type: %d", + qf_inums[type], type); + return PTR_ERR(qf_inode); + } + + /* Don't account quota for quota files to avoid recursion */ + qf_inode->i_flags |= S_NOQUOTA; + lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA); + err = dquot_load_quota_inode(qf_inode, type, format_id, flags); + if (err) + lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL); + iput(qf_inode); + + return err; +} + +/* Enable usage tracking for all quota types. */ +int ext4_enable_quotas(struct super_block *sb) +{ + int type, err = 0; + unsigned long qf_inums[EXT4_MAXQUOTAS] = { + le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) + }; + bool quota_mopt[EXT4_MAXQUOTAS] = { + test_opt(sb, USRQUOTA), + test_opt(sb, GRPQUOTA), + test_opt(sb, PRJQUOTA), + }; + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; + for (type = 0; type < EXT4_MAXQUOTAS; type++) { + if (qf_inums[type]) { + err = ext4_quota_enable(sb, type, QFMT_VFS_V1, + DQUOT_USAGE_ENABLED | + (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); + if (err) { + ext4_warning(sb, + "Failed to enable quota tracking " + "(type=%d, err=%d, ino=%lu). " + "Please run e2fsck to fix.", type, + err, qf_inums[type]); + + ext4_quotas_off(sb, type); + return err; + } + } + } + return 0; +} + +static int ext4_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + handle_t *handle; + int err; + + /* Force all delayed allocation blocks to be allocated. + * Caller already holds s_umount sem */ + if (test_opt(sb, DELALLOC)) + sync_filesystem(sb); + + if (!inode || !igrab(inode)) + goto out; + + err = dquot_quota_off(sb, type); + if (err || ext4_has_feature_quota(sb)) + goto out_put; + /* + * When the filesystem was remounted read-only first, we cannot cleanup + * inode flags here. Bad luck but people should be using QUOTA feature + * these days anyway. 
+ */ + if (sb_rdonly(sb)) + goto out_put; + + inode_lock(inode); + /* + * Update modification times of quota files when userspace can + * start looking at them. If we fail, we return success anyway since + * this is not a hard failure and quotas are already disabled. + */ + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out_unlock; + } + EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); + inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + err = ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); +out_unlock: + inode_unlock(inode); +out_put: + lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL); + iput(inode); + return err; +out: + return dquot_quota_off(sb, type); +} + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and no one else should touch the files) + * we don't have to be afraid of races */ +static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + struct buffer_head *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off+len > i_size) + len = i_size-off; + toread = len; + while (toread > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); + bh = ext4_bread(NULL, inode, blk, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + if (!bh) /* A hole? */ + memset(data, 0, tocopy); + else + memcpy(data, bh->b_data+offset, tocopy); + brelse(bh); + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile (we know the transaction is already started and has + * enough credits) */ +static ssize_t ext4_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); + int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1); + int retries = 0; + struct buffer_head *bh; + handle_t *handle = journal_current_handle(); + + if (!handle) { + ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because transaction is not started", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + /* + * Since we account only one data block in transaction credits, + * then it is impossible to cross a block boundary. 
+ */ + if (sb->s_blocksize - offset < len) { + ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because not block aligned", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + + do { + bh = ext4_bread(handle, inode, blk, + EXT4_GET_BLOCKS_CREATE | + EXT4_GET_BLOCKS_METADATA_NOFAIL); + } while (PTR_ERR(bh) == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)); + if (IS_ERR(bh)) + return PTR_ERR(bh); + if (!bh) + goto out; + BUFFER_TRACE(bh, "get write access"); + err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); + if (err) { + brelse(bh); + return err; + } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, len); + flush_dcache_folio(bh->b_folio); + unlock_buffer(bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + brelse(bh); +out: + if (inode->i_size < off + len) { + i_size_write(inode, off + len); + EXT4_I(inode)->i_disksize = inode->i_size; + err2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err2 && !err)) + err = err2; + } + return err ? err : len; +} +#endif + +#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) +static inline void register_as_ext2(void) +{ + int err = register_filesystem(&ext2_fs_type); + if (err) + printk(KERN_WARNING + "EXT4-fs: Unable to register as ext2 (%d)\n", err); +} + +static inline void unregister_as_ext2(void) +{ + unregister_filesystem(&ext2_fs_type); +} + +static inline int ext2_feature_set_ok(struct super_block *sb) +{ + if (ext4_has_unknown_ext2_incompat_features(sb)) + return 0; + if (sb_rdonly(sb)) + return 1; + if (ext4_has_unknown_ext2_ro_compat_features(sb)) + return 0; + return 1; +} +#else +static inline void register_as_ext2(void) { } +static inline void unregister_as_ext2(void) { } +static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } +#endif + +static inline void register_as_ext3(void) +{ + int err = register_filesystem(&ext3_fs_type); + if (err) + printk(KERN_WARNING + "EXT4-fs: Unable to register as ext3 (%d)\n", err); +} + +static inline void unregister_as_ext3(void) +{ + unregister_filesystem(&ext3_fs_type); +} + +static inline int ext3_feature_set_ok(struct super_block *sb) +{ + if (ext4_has_unknown_ext3_incompat_features(sb)) + return 0; + if (!ext4_has_feature_journal(sb)) + return 0; + if (sb_rdonly(sb)) + return 1; + if (ext4_has_unknown_ext3_ro_compat_features(sb)) + return 0; + return 1; +} + +static void ext4_kill_sb(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct file *bdev_file = sbi ? 
sbi->s_journal_bdev_file : NULL; + + kill_block_super(sb); + + if (bdev_file) + bdev_fput(bdev_file); +} + +static struct file_system_type ext4_fs_type = { + .owner = THIS_MODULE, + .name = "ext4", + .init_fs_context = ext4_init_fs_context, + .parameters = ext4_param_specs, + .kill_sb = ext4_kill_sb, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, +}; +MODULE_ALIAS_FS("ext4"); + +static int __init ext4_init_fs(void) +{ + int err; + + ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64); + ext4_li_info = NULL; + + /* Build-time check for flags consistency */ + ext4_check_flag_values(); + + err = ext4_init_es(); + if (err) + return err; + + err = ext4_init_pending(); + if (err) + goto out7; + + err = ext4_init_post_read_processing(); + if (err) + goto out6; + + err = ext4_init_pageio(); + if (err) + goto out5; + + err = ext4_init_system_zone(); + if (err) + goto out4; + + err = ext4_init_sysfs(); + if (err) + goto out3; + + err = ext4_init_mballoc(); + if (err) + goto out2; + err = init_inodecache(); + if (err) + goto out1; + + err = ext4_fc_init_dentry_cache(); + if (err) + goto out05; + + register_as_ext3(); + register_as_ext2(); + err = register_filesystem(&ext4_fs_type); + if (err) + goto out; + + return 0; +out: + unregister_as_ext2(); + unregister_as_ext3(); + ext4_fc_destroy_dentry_cache(); +out05: + destroy_inodecache(); +out1: + ext4_exit_mballoc(); +out2: + ext4_exit_sysfs(); +out3: + ext4_exit_system_zone(); +out4: + ext4_exit_pageio(); +out5: + ext4_exit_post_read_processing(); +out6: + ext4_exit_pending(); +out7: + ext4_exit_es(); + + return err; +} + +static void __exit ext4_exit_fs(void) +{ + ext4_destroy_lazyinit_thread(); + unregister_as_ext2(); + unregister_as_ext3(); + unregister_filesystem(&ext4_fs_type); + ext4_fc_destroy_dentry_cache(); + destroy_inodecache(); + ext4_exit_mballoc(); + ext4_exit_sysfs(); + ext4_exit_system_zone(); + ext4_exit_pageio(); + ext4_exit_post_read_processing(); + ext4_exit_es(); + ext4_exit_pending(); +} + +MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); +MODULE_DESCRIPTION("Fourth Extended Filesystem"); +MODULE_LICENSE("GPL"); +module_init(ext4_init_fs) +module_exit(ext4_exit_fs) -- 2.43.0
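A note for reviewers on the module bring-up code at the end of the patch above: ext4_init_fs() initialises one subsystem per step and, on any failure, falls through a ladder of labels (out, out05, out1 ... out7) that unwinds only the steps already completed, in reverse order; ext4_exit_fs() is the same teardown run unconditionally. Since that ordering is easy to break when trimming subsystems for U-Boot, here is a minimal standalone sketch of the idiom. The subsystem names (init_a() and friends) are invented for illustration and are not part of the ext4 code:

#include <stdio.h>

/* Illustrative subsystem init/exit pairs; each init returns 0 on success. */
static int init_a(void) { puts("init a"); return 0; }
static void exit_a(void) { puts("exit a"); }
static int init_b(void) { puts("init b"); return 0; }
static void exit_b(void) { puts("exit b"); }
static int init_c(void) { puts("init c (fails)"); return -1; }

static int subsys_init(void)
{
	int err;

	err = init_a();
	if (err)
		return err;		/* nothing to unwind yet */
	err = init_b();
	if (err)
		goto out_a;
	err = init_c();
	if (err)
		goto out_b;		/* c failed, so c itself needs no teardown */
	return 0;			/* all subsystems up */

	/* Teardown strictly in reverse order of bring-up */
out_b:
	exit_b();
out_a:
	exit_a();
	return err;
}

int main(void)
{
	return subsys_init() ? 1 : 0;
}

The key property is that each label releases exactly what was acquired before the failing step, which is why the labels in ext4_init_fs() must stay in strict reverse order of the init calls.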
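Similarly, ext4_quota_read() in the patch above walks the quota file one block at a time, with only the first block possibly unaligned (offset is zeroed after the first iteration). Below is a standalone sketch of that offset/length arithmetic against a flat byte array rather than buffer heads; the helper name and the in-memory "device" are hypothetical, not part of the ext4 code:

#include <stdio.h>
#include <string.h>

/* Copy len bytes starting at byte offset off, blocksize bytes at a time. */
static size_t blockwise_read(const unsigned char *dev, size_t blocksize,
			     char *data, size_t len, size_t off)
{
	size_t blk = off / blocksize;	/* first block touched */
	size_t offset = off % blocksize;	/* misalignment within it */
	size_t toread = len;

	while (toread > 0) {
		size_t tocopy = blocksize - offset;

		if (tocopy > toread)
			tocopy = toread;
		memcpy(data, dev + blk * blocksize + offset, tocopy);
		offset = 0;	/* only the first block can be unaligned */
		toread -= tocopy;
		data += tocopy;
		blk++;
	}
	return len;
}

int main(void)
{
	const unsigned char dev[32] = "0123456789abcdef0123456789abcdef";
	char buf[9] = "";

	/* Read 8 bytes at offset 5 with an 8-byte "block size":
	 * spans two blocks (3 bytes from the first, 5 from the second). */
	blockwise_read(dev, 8, buf, 8, 5);
	printf("%.8s\n", buf);	/* prints 56789abc */
	return 0;
}

ext4_quota_write() relies on the complementary invariant: the transaction reserves credits for a single data block, so a write that would cross a block boundary is rejected outright rather than looped.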
From: Simon Glass <simon.glass@canonical.com> Copy ext4.h from Linux v6.18 fs/ext4 directory. This is the main header file defining structures, constants, and function prototypes for ext4. Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> --- fs/ext4l/ext4.h | 3932 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 3932 insertions(+) create mode 100644 fs/ext4l/ext4.h diff --git a/fs/ext4l/ext4.h b/fs/ext4l/ext4.h new file mode 100644 index 00000000000..57087da6c7b --- /dev/null +++ b/fs/ext4l/ext4.h @@ -0,0 +1,3932 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ext4.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_H +#define _EXT4_H + +#include <linux/refcount.h> +#include <linux/types.h> +#include <linux/blkdev.h> +#include <linux/magic.h> +#include <linux/jbd2.h> +#include <linux/quota.h> +#include <linux/rwsem.h> +#include <linux/rbtree.h> +#include <linux/seqlock.h> +#include <linux/mutex.h> +#include <linux/timer.h> +#include <linux/wait.h> +#include <linux/sched/signal.h> +#include <linux/blockgroup_lock.h> +#include <linux/percpu_counter.h> +#include <linux/ratelimit.h> +#include <linux/crc32c.h> +#include <linux/falloc.h> +#include <linux/percpu-rwsem.h> +#include <linux/fiemap.h> +#ifdef __KERNEL__ +#include <linux/compat.h> +#endif +#include <uapi/linux/ext4.h> + +#include <linux/fscrypt.h> +#include <linux/fsverity.h> + +#include <linux/compiler.h> + +/* + * The fourth extended filesystem constants/structures + */ + +/* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + * with DOUBLE_CHECK defined mballoc creates persistent in-core + * bitmaps, maintains and uses them to check for double allocations + */ +#define DOUBLE_CHECK__ + +/* + * Define EXT4FS_DEBUG to produce debug messages + */ +#undef EXT4FS_DEBUG + +/* + * Debug code + */ +#ifdef EXT4FS_DEBUG +#define ext4_debug(f, a...) \ + do { \ + printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk(KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + + /* + * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c + */ +#define EXT_DEBUG__ + +/* + * Dynamic printk for controlled extents debugging. + */ +#ifdef CONFIG_EXT4_DEBUG +#define ext_debug(ino, fmt, ...) \ + pr_debug("[%s/%d] EXT4-fs (%s): ino %lu: (%s, %d): %s:" fmt, \ + current->comm, task_pid_nr(current), \ + ino->i_sb->s_id, ino->i_ino, __FILE__, __LINE__, \ + __func__, ##__VA_ARGS__) +#else +#define ext_debug(ino, fmt, ...) 
no_printk(fmt, ##__VA_ARGS__) +#endif + +#define ASSERT(assert) \ +do { \ + if (unlikely(!(assert))) { \ + printk(KERN_EMERG \ + "Assertion failure in %s() at %s:%d: '%s'\n", \ + __func__, __FILE__, __LINE__, #assert); \ + BUG(); \ + } \ +} while (0) + +/* data type for block offset of block group */ +typedef int ext4_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long long ext4_fsblk_t; + +/* data type for file logical block number */ +typedef __u32 ext4_lblk_t; + +/* data type for block group number */ +typedef unsigned int ext4_group_t; + +enum SHIFT_DIRECTION { + SHIFT_LEFT = 0, + SHIFT_RIGHT, +}; + +/* + * For each criteria, mballoc has slightly different way of finding + * the required blocks nad usually, higher the criteria the slower the + * allocation. We start at lower criterias and keep falling back to + * higher ones if we are not able to find any blocks. Lower (earlier) + * criteria are faster. + */ +enum criteria { + /* + * Used when number of blocks needed is a power of 2. This + * doesn't trigger any disk IO except prefetch and is the + * fastest criteria. + */ + CR_POWER2_ALIGNED, + + /* + * Tries to lookup in-memory data structures to find the most + * suitable group that satisfies goal request. No disk IO + * except block prefetch. + */ + CR_GOAL_LEN_FAST, + + /* + * Same as CR_GOAL_LEN_FAST but is allowed to reduce the goal + * length to the best available length for faster allocation. + */ + CR_BEST_AVAIL_LEN, + + /* + * Reads each block group sequentially, performing disk IO if + * necessary, to find suitable block group. Tries to + * allocate goal length but might trim the request if nothing + * is found after enough tries. + */ + CR_GOAL_LEN_SLOW, + + /* + * Finds the first free set of blocks and allocates + * those. This is only used in rare cases when + * CR_GOAL_LEN_SLOW also fails to allocate anything. + */ + CR_ANY_FREE, + + /* + * Number of criterias defined. + */ + EXT4_MB_NUM_CRS +}; + +/* + * Flags used in mballoc's allocation_context flags field. + * + * Also used to show what's going on for debugging purposes when the + * flag field is exported via the traceport interface + */ + +/* prefer goal again. length */ +#define EXT4_MB_HINT_MERGE 0x0001 +/* first blocks in the file */ +#define EXT4_MB_HINT_FIRST 0x0008 +/* data is being allocated */ +#define EXT4_MB_HINT_DATA 0x0020 +/* don't preallocate (for tails) */ +#define EXT4_MB_HINT_NOPREALLOC 0x0040 +/* allocate for locality group */ +#define EXT4_MB_HINT_GROUP_ALLOC 0x0080 +/* allocate goal blocks or none */ +#define EXT4_MB_HINT_GOAL_ONLY 0x0100 +/* goal is meaningful */ +#define EXT4_MB_HINT_TRY_GOAL 0x0200 +/* blocks already pre-reserved by delayed allocation */ +#define EXT4_MB_DELALLOC_RESERVED 0x0400 +/* We are doing stream allocation */ +#define EXT4_MB_STREAM_ALLOC 0x0800 +/* Use reserved root blocks if needed */ +#define EXT4_MB_USE_ROOT_BLOCKS 0x1000 +/* Use blocks from reserved pool */ +#define EXT4_MB_USE_RESERVED 0x2000 +/* Do strict check for free blocks while retrying block allocation */ +#define EXT4_MB_STRICT_CHECK 0x4000 + +struct ext4_allocation_request { + /* target inode for block we're allocating */ + struct inode *inode; + /* how many blocks we want to allocate */ + unsigned int len; + /* logical block in target inode */ + ext4_lblk_t logical; + /* the closest logical allocated block to the left */ + ext4_lblk_t lleft; + /* the closest logical allocated block to the right */ + ext4_lblk_t lright; + /* phys. 
target (a hint) */ + ext4_fsblk_t goal; + /* phys. block for the closest logical allocated block to the left */ + ext4_fsblk_t pleft; + /* phys. block for the closest logical allocated block to the right */ + ext4_fsblk_t pright; + /* flags. see above EXT4_MB_HINT_* */ + unsigned int flags; +}; + +/* + * Logical to physical block mapping, used by ext4_map_blocks() + * + * This structure is used to pass requests into ext4_map_blocks() as + * well as to store the information returned by ext4_map_blocks(). It + * takes less room on the stack than a struct buffer_head. + */ +#define EXT4_MAP_NEW BIT(BH_New) +#define EXT4_MAP_MAPPED BIT(BH_Mapped) +#define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten) +#define EXT4_MAP_BOUNDARY BIT(BH_Boundary) +#define EXT4_MAP_DELAYED BIT(BH_Delay) +/* + * This is for use in ext4_map_query_blocks() for a special case where we can + * have a physically and logically contiguous blocks split across two leaf + * nodes instead of a single extent. This is required in case of atomic writes + * to know whether the returned extent is last in leaf. If yes, then lookup for + * next in leaf block in ext4_map_query_blocks_next_in_leaf(). + * - This is never going to be added to any buffer head state. + * - We use the next available bit after BH_BITMAP_UPTODATE. + */ +#define EXT4_MAP_QUERY_LAST_IN_LEAF BIT(BH_BITMAP_UPTODATE + 1) +#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ + EXT4_MAP_DELAYED | EXT4_MAP_QUERY_LAST_IN_LEAF) + +struct ext4_map_blocks { + ext4_fsblk_t m_pblk; + ext4_lblk_t m_lblk; + unsigned int m_len; + unsigned int m_flags; +}; + +/* + * Block validity checking, system zone rbtree. + */ +struct ext4_system_blocks { + struct rb_root root; + struct rcu_head rcu; +}; + +/* + * Flags for ext4_io_end->flags + */ +#define EXT4_IO_END_UNWRITTEN 0x0001 +#define EXT4_IO_END_FAILED 0x0002 + +#define EXT4_IO_END_DEFER_COMPLETION (EXT4_IO_END_UNWRITTEN | EXT4_IO_END_FAILED) + +struct ext4_io_end_vec { + struct list_head list; /* list of io_end_vec */ + loff_t offset; /* offset in the file */ + ssize_t size; /* size of the extent */ +}; + +/* + * For converting unwritten extents on a work queue. 'handle' is used for + * buffered writeback. 
+ */ +typedef struct ext4_io_end { + struct list_head list; /* per-file finished IO list */ + handle_t *handle; /* handle reserved for extent + * conversion */ + struct inode *inode; /* file being written to */ + struct bio *bio; /* Linked list of completed + * bios covering the extent */ + unsigned int flag; /* unwritten or not */ + refcount_t count; /* reference counter */ + struct list_head list_vec; /* list of ext4_io_end_vec */ +} ext4_io_end_t; + +struct ext4_io_submit { + struct writeback_control *io_wbc; + struct bio *io_bio; + ext4_io_end_t *io_end; + sector_t io_next_block; +}; + +/* + * Special inodes numbers + */ +#define EXT4_BAD_INO 1 /* Bad blocks inode */ +#define EXT4_ROOT_INO 2 /* Root inode */ +#define EXT4_USR_QUOTA_INO 3 /* User quota inode */ +#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ +#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT4_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext4 filesystems */ +#define EXT4_GOOD_OLD_FIRST_INO 11 + +/* + * Maximal count of links to a file + */ +#define EXT4_LINK_MAX 65000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT4_MIN_BLOCK_SIZE 1024 +#define EXT4_MAX_BLOCK_SIZE 65536 +#define EXT4_MIN_BLOCK_LOG_SIZE 10 +#define EXT4_MAX_BLOCK_LOG_SIZE 16 +#define EXT4_MAX_CLUSTER_LOG_SIZE 30 +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) +#else +# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) +#endif +#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) +#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ + EXT4_SB(s)->s_cluster_bits) +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) +#else +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) +#endif +#ifdef __KERNEL__ +#define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) +#define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) +#define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) +#else +#define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ + EXT4_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) +#define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? 
\ + EXT4_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) +#endif +#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) +#define EXT4_MAX_BLOCKS(size, offset, blkbits) \ + ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ + blkbits)) +#define EXT4_B_TO_LBLK(inode, offset) \ + (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits) + +/* Translate a block number to a cluster number */ +#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) +/* Translate a cluster number to a block number */ +#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) +/* Translate # of blks to # of clusters */ +#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ + (sbi)->s_cluster_bits) +/* Mask out the low bits to get the starting block of the cluster */ +#define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ + ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ + ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Fill in the low bits to get the last block of the cluster */ +#define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \ + ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1)) +/* Get the cluster offset */ +#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ + ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ + ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) + +/* + * Structure of a blocks group descriptor + */ +struct ext4_group_desc +{ + __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ + __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ + __le32 bg_inode_table_lo; /* Inodes table block */ + __le16 bg_free_blocks_count_lo;/* Free blocks count */ + __le16 bg_free_inodes_count_lo;/* Free inodes count */ + __le16 bg_used_dirs_count_lo; /* Directories count */ + __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ + __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ + __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */ + __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */ + __le16 bg_itable_unused_lo; /* Unused inodes count */ + __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ + __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ + __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ + __le32 bg_inode_table_hi; /* Inodes table block MSB */ + __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ + __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ + __le16 bg_used_dirs_count_hi; /* Directories count MSB */ + __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ + __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ + __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */ + __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */ + __u32 bg_reserved; +}; + +#define EXT4_BG_INODE_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \ + sizeof(__le16)) +#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \ + sizeof(__le16)) + +/* + * Structure of a flex block group info + */ + +struct flex_groups { + atomic64_t free_clusters; + atomic_t free_inodes; + atomic_t used_dirs; +}; + +#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ +#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ +#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ + +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT4_MIN_DESC_SIZE 32 
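+/*
+ * Note (not from upstream): a descriptor occupies 32 bytes on filesystems
+ * without the 64bit feature (the fields up to and including bg_checksum);
+ * once 64bit is enabled, s_desc_size is at least 64 so that the *_hi
+ * members of struct ext4_group_desc above are present on disk.
+ */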
+#define EXT4_MIN_DESC_SIZE_64BIT 64 +#define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE +#define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) +#ifdef __KERNEL__ +# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) +# define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) +# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) +# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) +#else +# define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) +# define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) +#endif + +/* + * Constants relative to the data blocks + */ +#define EXT4_NDIR_BLOCKS 12 +#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS +#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) +#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) +#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT4_UNRM_FL 0x00000002 /* Undelete */ +#define EXT4_COMPR_FL 0x00000004 /* Compress file */ +#define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT4_DIRTY_FL 0x00000100 +#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ + /* nb: was previously EXT2_ECOMPR_FL */ +#define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */ +/* End compression flags --- maybe not all used */ +#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ +#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ +#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +/* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */ + +#define EXT4_DAX_FL 0x02000000 /* Inode is DAX */ + +#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. 
*/ +#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ +#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */ +#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +/* User modifiable flags */ +#define EXT4_FL_USER_MODIFIABLE (EXT4_SECRM_FL | \ + EXT4_UNRM_FL | \ + EXT4_COMPR_FL | \ + EXT4_SYNC_FL | \ + EXT4_IMMUTABLE_FL | \ + EXT4_APPEND_FL | \ + EXT4_NODUMP_FL | \ + EXT4_NOATIME_FL | \ + EXT4_JOURNAL_DATA_FL | \ + EXT4_NOTAIL_FL | \ + EXT4_DIRSYNC_FL | \ + EXT4_TOPDIR_FL | \ + EXT4_EXTENTS_FL | \ + 0x00400000 /* EXT4_EOFBLOCKS_FL */ | \ + EXT4_DAX_FL | \ + EXT4_PROJINHERIT_FL | \ + EXT4_CASEFOLD_FL) + +/* User visible flags */ +#define EXT4_FL_USER_VISIBLE (EXT4_FL_USER_MODIFIABLE | \ + EXT4_DIRTY_FL | \ + EXT4_COMPRBLK_FL | \ + EXT4_NOCOMPR_FL | \ + EXT4_ENCRYPT_FL | \ + EXT4_INDEX_FL | \ + EXT4_VERITY_FL | \ + EXT4_INLINE_DATA_FL) + +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ + EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ + EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\ + EXT4_DAX_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\ + EXT4_PROJINHERIT_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) + +/* The only flags that should be swapped */ +#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) + +/* Flags which are mutually exclusive to DAX */ +#define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\ + EXT4_JOURNAL_DATA_FL | EXT4_INLINE_DATA_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT4_REG_FLMASK; + else + return flags & EXT4_OTHER_FLMASK; +} + +/* + * Inode flags used for atomic set/get + */ +enum { + EXT4_INODE_SECRM = 0, /* Secure deletion */ + EXT4_INODE_UNRM = 1, /* Undelete */ + EXT4_INODE_COMPR = 2, /* Compress file */ + EXT4_INODE_SYNC = 3, /* Synchronous updates */ + EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ + EXT4_INODE_APPEND = 5, /* writes to file may only append */ + EXT4_INODE_NODUMP = 6, /* do not dump file */ + EXT4_INODE_NOATIME = 7, /* do not update atime */ +/* Reserved for compression usage... 
*/ + EXT4_INODE_DIRTY = 8, + EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ + EXT4_INODE_NOCOMPR = 10, /* Don't compress */ + EXT4_INODE_ENCRYPT = 11, /* Encrypted file */ +/* End compression flags --- maybe not all used */ + EXT4_INODE_INDEX = 12, /* hash-indexed directory */ + EXT4_INODE_IMAGIC = 13, /* AFS directory */ + EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ + EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ + EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ + EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ + EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ + EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ + EXT4_INODE_VERITY = 20, /* Verity protected inode */ + EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ +/* 22 was formerly EXT4_INODE_EOFBLOCKS */ + EXT4_INODE_DAX = 25, /* Inode is DAX */ + EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ + EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ + EXT4_INODE_CASEFOLD = 30, /* Casefolded directory */ + EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ +}; + +/* + * Since it's pretty easy to mix up bit numbers and hex values, we use a + * build-time check to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost + * any extra space in the compiled kernel image, otherwise, the build will fail. + * It's important that these values are the same, since we are using + * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent + * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk + * values found in ext2, ext3 and ext4 filesystems, and of course the values + * defined in e2fsprogs. + * + * It's not paranoia if the Murphy's Law really *is* out to get you. 
:-) + */ +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) + +static inline void ext4_check_flag_values(void) +{ + CHECK_FLAG_VALUE(SECRM); + CHECK_FLAG_VALUE(UNRM); + CHECK_FLAG_VALUE(COMPR); + CHECK_FLAG_VALUE(SYNC); + CHECK_FLAG_VALUE(IMMUTABLE); + CHECK_FLAG_VALUE(APPEND); + CHECK_FLAG_VALUE(NODUMP); + CHECK_FLAG_VALUE(NOATIME); + CHECK_FLAG_VALUE(DIRTY); + CHECK_FLAG_VALUE(COMPRBLK); + CHECK_FLAG_VALUE(NOCOMPR); + CHECK_FLAG_VALUE(ENCRYPT); + CHECK_FLAG_VALUE(INDEX); + CHECK_FLAG_VALUE(IMAGIC); + CHECK_FLAG_VALUE(JOURNAL_DATA); + CHECK_FLAG_VALUE(NOTAIL); + CHECK_FLAG_VALUE(DIRSYNC); + CHECK_FLAG_VALUE(TOPDIR); + CHECK_FLAG_VALUE(HUGE_FILE); + CHECK_FLAG_VALUE(EXTENTS); + CHECK_FLAG_VALUE(VERITY); + CHECK_FLAG_VALUE(EA_INODE); + CHECK_FLAG_VALUE(INLINE_DATA); + CHECK_FLAG_VALUE(PROJINHERIT); + CHECK_FLAG_VALUE(CASEFOLD); + CHECK_FLAG_VALUE(RESERVED); +} + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +struct compat_ext4_new_group_input { + u32 group; + compat_u64 block_bitmap; + compat_u64 inode_bitmap; + compat_u64 inode_table; + u32 blocks_count; + u16 reserved_blocks; + u16 unused; +}; +#endif + +/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ +struct ext4_new_group_data { + __u32 group; + __u64 block_bitmap; + __u64 inode_bitmap; + __u64 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 mdata_blocks; + __u32 free_clusters_count; +}; + +/* Indexes used to index group tables in ext4_new_group_data */ +enum { + BLOCK_BITMAP = 0, /* block bitmap */ + INODE_BITMAP, /* inode bitmap */ + INODE_TABLE, /* inode tables */ + GROUP_TABLE_COUNT, +}; + +/* + * Flags used by ext4_map_blocks() + */ + /* Allocate any needed blocks and/or convert an unwritten + extent to be an initialized ext4 */ +#define EXT4_GET_BLOCKS_CREATE 0x0001 + /* Request the creation of an unwritten extent */ +#define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002 +#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\ + EXT4_GET_BLOCKS_CREATE) + /* Caller is from the delayed allocation writeout path + * finally doing the actual allocation of delayed blocks */ +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 + /* caller is from the direct IO path, request to creation of an + unwritten extents if not allocated, split the unwritten + extent if blocks has been preallocated already*/ +#define EXT4_GET_BLOCKS_PRE_IO 0x0008 +#define EXT4_GET_BLOCKS_CONVERT 0x0010 +#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Eventual metadata allocation (due to growing extent tree) + * should not fail, so try to use reserved blocks for that.*/ +#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 + /* Don't normalize allocation size (used for fallocate) */ +#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 + /* Convert written extents to unwritten */ +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100 + /* Write zeros to newly created written extents */ +#define EXT4_GET_BLOCKS_ZERO 0x0200 +#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ + EXT4_GET_BLOCKS_ZERO) + /* Caller is in the context of data submission, such as writeback, + * fsync, etc. Especially, in the generic writeback path, caller will + * submit data before dropping transaction handle. This allows jbd2 + * to avoid submitting data before commit. 
*/ +#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT |\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |\ + EXT4_GET_BLOCKS_IO_SUBMIT) + /* Caller is in the atomic contex, find extent if it has been cached */ +#define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800 +/* + * Atomic write caller needs this to query in the slow path of mixed mapping + * case, when a contiguous extent can be split across two adjacent leaf nodes. + * Look EXT4_MAP_QUERY_LAST_IN_LEAF. + */ +#define EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF 0x1000 + +/* + * The bit position of these flags must not overlap with any of the + * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(), + * read_extent_tree_block(), ext4_split_extent_at(), + * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). + * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be + * caching the extents when reading from the extent tree while a + * truncate or punch hole operation is in progress. + */ +#define EXT4_EX_NOCACHE 0x40000000 +#define EXT4_EX_FORCE_CACHE 0x20000000 +#define EXT4_EX_NOFAIL 0x10000000 +/* + * ext4_map_query_blocks() uses this filter mask to filter the flags needed to + * pass while lookup/querying of on disk extent tree. + */ +#define EXT4_EX_QUERY_FILTER (EXT4_EX_NOCACHE | EXT4_EX_FORCE_CACHE |\ + EXT4_EX_NOFAIL |\ + EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) + +/* + * Flags used by ext4_free_blocks + */ +#define EXT4_FREE_BLOCKS_METADATA 0x0001 +#define EXT4_FREE_BLOCKS_FORGET 0x0002 +#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 +#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 +#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 +#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 +#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +/* + * ioctl commands in 32 bit emulation + */ +#define EXT4_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT4_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) +#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) +#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION +#endif + +/* Max physical block we can address w/o extents */ +#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF + +/* Max logical block we can support */ +#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFE + +/* + * Structure of an inode on the disk + */ +struct ext4_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size_lo; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Inode Change time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks_lo; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __le32 l_i_version; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl_lo; /* File ACL */ + __le32 i_size_high; + __le32 i_obso_faddr; /* Obsoleted fragment address */ + union { + struct { + __le16 l_i_blocks_high; /* were l_i_reserved1 */ + __le16 
l_i_file_acl_high; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */ + __le16 l_i_reserved; + } linux2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __le16 m_i_file_acl_high; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */ + __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ + __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ + __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ + __le32 i_crtime; /* File Creation time */ + __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ + __le32 i_version_hi; /* high 32 bits for 64-bit version */ + __le32 i_projid; /* Project ID */ +}; + +#define EXT4_EPOCH_BITS 2 +#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) +#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) + +/* + * Extended fields will fit into an inode if the filesystem was formatted + * with large inodes (-I 256 or larger) and there are not currently any EAs + * consuming all of the available space. For new inodes we always reserve + * enough space for the kernel's known extended fields, but for inodes + * created with an old kernel this might not have been the case. None of + * the extended inode fields is critical for correct filesystem operation. + * This macro checks if a certain field fits in the inode. Note that + * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize + */ +#define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ + ((offsetof(typeof(*ext4_inode), field) + \ + sizeof((ext4_inode)->field)) \ + <= (EXT4_GOOD_OLD_INODE_SIZE + \ + (einode)->i_extra_isize)) \ + +/* + * We use an encoding that preserves the times for extra epoch "00": + * + * extra msb of adjust for signed + * epoch 32-bit 32-bit tv_sec to + * bits time decoded 64-bit tv_sec 64-bit tv_sec valid time range + * 0 0 1 -0x80000000..-0x00000001 0x000000000 1901-12-13..1969-12-31 + * 0 0 0 0x000000000..0x07fffffff 0x000000000 1970-01-01..2038-01-19 + * 0 1 1 0x080000000..0x0ffffffff 0x100000000 2038-01-19..2106-02-07 + * 0 1 0 0x100000000..0x17fffffff 0x100000000 2106-02-07..2174-02-25 + * 1 0 1 0x180000000..0x1ffffffff 0x200000000 2174-02-25..2242-03-16 + * 1 0 0 0x200000000..0x27fffffff 0x200000000 2242-03-16..2310-04-04 + * 1 1 1 0x280000000..0x2ffffffff 0x300000000 2310-04-04..2378-04-22 + * 1 1 0 0x300000000..0x37fffffff 0x300000000 2378-04-22..2446-05-10 + * + * Note that previous versions of the kernel on 64-bit systems would + * incorrectly use extra epoch bits 1,1 for dates between 1901 and + * 1970. e2fsck will correct this, assuming that it is run on the + * affected filesystem before 2242. 
+ */ + +static inline __le32 ext4_encode_extra_time(struct timespec64 ts) +{ + u32 extra = ((ts.tv_sec - (s32)ts.tv_sec) >> 32) & EXT4_EPOCH_MASK; + return cpu_to_le32(extra | (ts.tv_nsec << EXT4_EPOCH_BITS)); +} + +static inline struct timespec64 ext4_decode_extra_time(__le32 base, + __le32 extra) +{ + struct timespec64 ts = { .tv_sec = (signed)le32_to_cpu(base) }; + + if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) + ts.tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; + ts.tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; + return ts; +} + +#define EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, ts) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ + (raw_inode)->xtime = cpu_to_le32((ts).tv_sec); \ + (raw_inode)->xtime ## _extra = ext4_encode_extra_time(ts); \ + } else \ + (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \ +} while (0) + +#define EXT4_INODE_SET_ATIME(inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(i_atime, inode, raw_inode, inode_get_atime(inode)) + +#define EXT4_INODE_SET_MTIME(inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(i_mtime, inode, raw_inode, inode_get_mtime(inode)) + +#define EXT4_INODE_SET_CTIME(inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode)) + +#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + EXT4_INODE_SET_XTIME_VAL(xtime, &((einode)->vfs_inode), \ + raw_inode, (einode)->xtime) + +#define EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode) \ + (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra) ? \ + ext4_decode_extra_time((raw_inode)->xtime, \ + (raw_inode)->xtime ## _extra) : \ + (struct timespec64) { \ + .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \ + }) + +#define EXT4_INODE_GET_ATIME(inode, raw_inode) \ +do { \ + inode_set_atime_to_ts(inode, \ + EXT4_INODE_GET_XTIME_VAL(i_atime, inode, raw_inode)); \ +} while (0) + +#define EXT4_INODE_GET_MTIME(inode, raw_inode) \ +do { \ + inode_set_mtime_to_ts(inode, \ + EXT4_INODE_GET_XTIME_VAL(i_mtime, inode, raw_inode)); \ +} while (0) + +#define EXT4_INODE_GET_CTIME(inode, raw_inode) \ +do { \ + inode_set_ctime_to_ts(inode, \ + EXT4_INODE_GET_XTIME_VAL(i_ctime, inode, raw_inode)); \ +} while (0) + +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (einode)->xtime = \ + EXT4_INODE_GET_XTIME_VAL(xtime, &(einode->vfs_inode), \ + raw_inode); \ + else \ + (einode)->xtime = (struct timespec64){0, 0}; \ +} while (0) + +#define i_disk_version osd1.linux1.l_i_version + +#if defined(__KERNEL__) || defined(__linux__) +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_file_acl_high osd2.linux2.l_i_file_acl_high +#define i_blocks_high osd2.linux2.l_i_blocks_high +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_checksum_lo osd2.linux2.l_i_checksum_lo + +#elif defined(__GNU__) + +#define i_translator osd1.hurd1.h_i_translator +#define i_uid_high osd2.hurd2.h_i_uid_high +#define i_gid_high osd2.hurd2.h_i_gid_high +#define i_author osd2.hurd2.h_i_author + +#elif defined(__masix__) + +#define i_reserved1 osd1.masix1.m_i_reserved1 +#define i_file_acl_high osd2.masix2.m_i_file_acl_high +#define i_reserved2 osd2.masix2.m_i_reserved2 + +#endif /* defined(__KERNEL__) || defined(__linux__) */ + +#include "extents_status.h" +#include 
"fast_commit.h" + +/* + * Lock subclasses for i_data_sem in the ext4_inode_info structure. + * + * These are needed to avoid lockdep false positives when we need to + * allocate blocks to the quota inode during ext4_map_blocks(), while + * holding i_data_sem for a normal (non-quota) inode. Since we don't + * do quota tracking for the quota inode, this avoids deadlock (as + * well as infinite recursion, since it isn't turtles all the way + * down...) + * + * I_DATA_SEM_NORMAL - Used for most inodes + * I_DATA_SEM_OTHER - Used by move_inode.c for the second normal inode + * where the second inode has larger inode number + * than the first + * I_DATA_SEM_QUOTA - Used for quota inodes only + * I_DATA_SEM_EA - Used for ea_inodes only + */ +enum { + I_DATA_SEM_NORMAL = 0, + I_DATA_SEM_OTHER, + I_DATA_SEM_QUOTA, + I_DATA_SEM_EA +}; + + +/* + * fourth extended file system inode data in memory + */ +struct ext4_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_dtime; + ext4_fsblk_t i_file_acl; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is used for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + ext4_group_t i_block_group; + ext4_lblk_t i_dir_start_lookup; +#if (BITS_PER_LONG < 64) + unsigned long i_state_flags; /* Dynamic state flags */ +#endif + unsigned long i_flags; + + /* + * Extended attributes can be read independently of the main file + * data. Taking i_rwsem even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; + + /* + * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise + * i_orphan is used. + */ + union { + struct list_head i_orphan; /* unlinked but open inodes */ + unsigned int i_orphan_idx; /* Index in orphan file */ + }; + + /* Fast commit related info */ + + /* For tracking dentry create updates */ + struct list_head i_fc_dilist; + struct list_head i_fc_list; /* + * inodes that need fast commit + * protected by sbi->s_fc_lock. + */ + + /* Start of lblk range that needs to be committed in this fast commit */ + ext4_lblk_t i_fc_lblk_start; + + /* End of lblk range that needs to be committed in this fast commit */ + ext4_lblk_t i_fc_lblk_len; + + spinlock_t i_raw_lock; /* protects updates to the raw inode */ + + /* Fast commit wait queue for this inode */ + wait_queue_head_t i_fc_wait; + + /* + * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len + * and inode's EXT4_FC_STATE_COMMITTING state bit. + */ + spinlock_t i_fc_lock; + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext4_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. 
The only things which change i_disksize + * are ext4_get_block (growth) and ext4_truncate (shrinkth). + */ + loff_t i_disksize; + + /* + * i_data_sem is for serialising ext4_truncate() against + * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext4 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have i_data_sem. + */ + struct rw_semaphore i_data_sem; + struct inode vfs_inode; + struct jbd2_inode *jinode; + + /* + * File creation time. Its function is same as that of + * struct timespec64 i_{a,c,m}time in the generic inode. + */ + struct timespec64 i_crtime; + + /* mballoc */ + atomic_t i_prealloc_active; + + /* allocation reservation info for delalloc */ + /* In case of bigalloc, this refer to clusters rather than blocks */ + unsigned int i_reserved_data_blocks; + struct rb_root i_prealloc_node; + rwlock_t i_prealloc_lock; + + /* extents status tree */ + struct ext4_es_tree i_es_tree; + rwlock_t i_es_lock; + struct list_head i_es_list; + unsigned int i_es_all_nr; /* protected by i_es_lock */ + unsigned int i_es_shk_nr; /* protected by i_es_lock */ + ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for + extents to shrink. Protected by + i_es_lock */ + + /* ialloc */ + ext4_group_t i_last_alloc_group; + + /* pending cluster reservations for bigalloc file systems */ + struct ext4_pending_tree i_pending_tree; + + /* on-disk additional length */ + __u16 i_extra_isize; + + /* Indicate the inline data space. */ + u16 i_inline_off; + u16 i_inline_size; + +#ifdef CONFIG_QUOTA + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif + spinlock_t i_block_reservation_lock; + + /* Lock protecting lists below */ + spinlock_t i_completed_io_lock; + /* + * Completed IOs that need unwritten extents handling and have + * transaction reserved + */ + struct list_head i_rsv_conversion_list; + struct work_struct i_rsv_conversion_work; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; + +#ifdef CONFIG_QUOTA + struct dquot __rcu *i_dquot[MAXQUOTAS]; +#endif + + /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ + __u32 i_csum_seed; + + kprojid_t i_projid; + +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; +#endif + +#ifdef CONFIG_FS_VERITY + struct fsverity_info *i_verity_info; +#endif +}; + +/* + * File system states + */ +#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT4_ERROR_FS 0x0002 /* Errors detected */ +#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ +#define EXT4_FC_REPLAY 0x0020 /* Fast commit replay ongoing */ + +/* + * Misc. 
filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + +/* + * Mount flags set via mount options or defaults + */ +#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ +#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT4_MOUNT_ERRORS_MASK 0x00070 +#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#ifdef CONFIG_FS_DAX +#define EXT4_MOUNT_DAX_ALWAYS 0x00200 /* Direct Access */ +#else +#define EXT4_MOUNT_DAX_ALWAYS 0 +#endif +#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ +#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT4_MOUNT_QUOTA 0x40000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x80000 /* "old" user quota, + * enable enforcement for hidden + * quota files */ +#define EXT4_MOUNT_GRPQUOTA 0x100000 /* "old" group quota, enable + * enforcement for hidden quota + * files */ +#define EXT4_MOUNT_PRJQUOTA 0x200000 /* Enable project quota + * enforcement */ +#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ +#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ +#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ +#define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ +#define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000 +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ +#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ +#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ +#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ + +/* + * Mount flags set either automatically (could not be set by mount option) + * based on per file system feature or property or in special cases such as + * distinguishing between explicit mount option definition and default. 
+ */ +#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly + specified delalloc */ +#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group + size of blocksize * 8 + blocks */ +#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated + file systems */ +#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly + specified journal checksum */ + +#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */ +#define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */ +#define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */ +#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group + * scanning in mballoc + */ +#define EXT4_MOUNT2_ABORT 0x00000100 /* Abort filesystem */ + +#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ + ~EXT4_MOUNT_##opt +#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ + EXT4_MOUNT_##opt +#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ + EXT4_MOUNT_##opt) + +#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ + ~EXT4_MOUNT2_##opt +#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ + EXT4_MOUNT2_##opt +#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ + EXT4_MOUNT2_##opt) + +#define ext4_test_and_set_bit __test_and_set_bit_le +#define ext4_set_bit __set_bit_le +#define ext4_test_and_clear_bit __test_and_clear_bit_le +#define ext4_clear_bit __clear_bit_le +#define ext4_test_bit test_bit_le +#define ext4_find_next_zero_bit find_next_zero_bit_le +#define ext4_find_next_bit find_next_bit_le + +extern void mb_set_bits(void *bm, int cur, int len); + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT4_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT4_ERRORS_PANIC 3 /* Panic */ +#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE + +/* Metadata checksum algorithm codes */ +#define EXT4_CRC32C_CHKSUM 1 + +#define EXT4_LABEL_MAX 16 + +/* + * Structure of the super block + */ +struct ext4_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count_lo; /* Blocks count */ + __le32 s_r_blocks_count_lo; /* Reserved blocks count */ + __le32 s_free_blocks_count_lo; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_log_cluster_size; /* Allocation cluster size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_clusters_per_group; /* # Clusters per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. 
time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT4_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. + * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[EXT4_LABEL_MAX] __nonstring; /* volume name */ +/*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. 
+ */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_jnl_backup_type; + __le16 s_desc_size; /* size of group descriptor */ +/*100*/ __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_INCOMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_checksum_type; /* metadata checksum algorithm used */ + __u8 s_encryption_level; /* versioning level for encryption */ + __u8 s_reserved_pad; /* Padding to next 32bits */ + __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ + __le32 s_snapshot_inum; /* Inode number of active snapshot */ + __le32 s_snapshot_id; /* sequential ID of active snapshot */ + __le64 s_snapshot_r_blocks_count; /* reserved blocks for active + snapshot's future use */ + __le32 s_snapshot_list; /* inode number of the head of the + on-disk snapshot list */ +#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) + __le32 s_error_count; /* number of fs errors */ + __le32 s_first_error_time; /* first time an error happened */ + __le32 s_first_error_ino; /* inode involved in first error */ + __le64 s_first_error_block; /* block involved of first error */ + __u8 s_first_error_func[32] __nonstring; /* function where the error happened */ + __le32 s_first_error_line; /* line number where error happened */ + __le32 s_last_error_time; /* most recent time of an error */ + __le32 s_last_error_ino; /* inode involved in last error */ + __le32 s_last_error_line; /* line number where error happened */ + __le64 s_last_error_block; /* block involved of last error */ + __u8 s_last_error_func[32] __nonstring; /* function where the error happened */ +#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) + __u8 s_mount_opts[64]; + __le32 s_usr_quota_inum; /* inode for tracking user quota */ + __le32 s_grp_quota_inum; /* inode for tracking group quota */ + __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ + __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ + __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ + __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ + __le32 s_lpf_ino; /* Location of the lost+found inode */ + __le32 s_prj_quota_inum; /* inode for tracking project quota */ + __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ + __u8 s_wtime_hi; + __u8 s_mtime_hi; + __u8 s_mkfs_time_hi; + __u8 s_lastcheck_hi; + __u8 s_first_error_time_hi; + __u8 s_last_error_time_hi; 
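+ /*
+ * Note (not from upstream): the *_hi bytes above supply bits 32..39 of
+ * the matching 32-bit second counters, e.g. s_wtime_hi == 0x01 with
+ * s_wtime == 0x12345678 decodes to 0x112345678 seconds since the epoch,
+ * extending these timestamps well past 2038.
+ */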
+ __u8 s_first_error_errcode; + __u8 s_last_error_errcode; + __le16 s_encoding; /* Filename charset encoding */ + __le16 s_encoding_flags; /* Filename charset encoding flags */ + __le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */ + __le16 s_def_resuid_hi; + __le16 s_def_resgid_hi; + __le32 s_reserved[93]; /* Padding to the end of the block */ + __le32 s_checksum; /* crc32c(superblock) */ +}; + +#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) + +#ifdef __KERNEL__ + +/* Number of quota types we support */ +#define EXT4_MAXQUOTAS 3 + +#define EXT4_ENC_UTF8_12_1 1 + +/* Types of ext4 journal triggers */ +enum ext4_journal_trigger_type { + EXT4_JTR_ORPHAN_FILE, + EXT4_JTR_NONE /* This must be the last entry for indexing to work! */ +}; + +#define EXT4_JOURNAL_TRIGGER_COUNT EXT4_JTR_NONE + +struct ext4_journal_trigger { + struct jbd2_buffer_trigger_type tr_triggers; + struct super_block *sb; +}; + +static inline struct ext4_journal_trigger *EXT4_TRIGGER( + struct jbd2_buffer_trigger_type *trigger) +{ + return container_of(trigger, struct ext4_journal_trigger, tr_triggers); +} + +#define EXT4_ORPHAN_BLOCK_MAGIC 0x0b10ca04 + +/* Structure at the tail of orphan block */ +struct ext4_orphan_block_tail { + __le32 ob_magic; + __le32 ob_checksum; +}; + +static inline int ext4_inodes_per_orphan_block(struct super_block *sb) +{ + return (sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)) / + sizeof(u32); +} + +struct ext4_orphan_block { + atomic_t ob_free_entries; /* Number of free orphan entries in block */ + struct buffer_head *ob_bh; /* Buffer for orphan block */ +}; + +/* + * Info about orphan file. + */ +struct ext4_orphan_info { + int of_blocks; /* Number of orphan blocks in a file */ + __u32 of_csum_seed; /* Checksum seed for orphan file */ + struct ext4_orphan_block *of_binfo; /* Array with info about orphan + * file blocks */ +}; + +/* + * fourth extended-fs super-block data in memory + */ +struct ext4_sb_info { + unsigned long s_desc_size; /* Size of a group descriptor in bytes */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_clusters_per_group; /* Number of clusters in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ + unsigned long s_overhead; /* # of fs overhead clusters */ + unsigned int s_cluster_ratio; /* Number of blocks per cluster */ + unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + /* Array of bh's for the block group descriptors */ + struct buffer_head * __rcu *s_group_desc; + unsigned int s_mount_opt; + unsigned int s_mount_opt2; + unsigned long s_mount_flags; + unsigned int s_def_mount_opt; + unsigned int s_def_mount_opt2; + ext4_fsblk_t s_sb_block; + atomic64_t s_resv_clusters; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + 
int s_inode_size; + int s_first_ino; + unsigned int s_inode_readahead_blks; + unsigned int s_inode_goal; + u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */ + struct percpu_counter s_freeclusters_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct percpu_counter s_dirtyclusters_counter; + struct percpu_counter s_sra_exceeded_retry_limit; + struct blockgroup_lock *s_blockgroup_lock; + struct proc_dir_entry *s_proc; + struct kobject s_kobj; + struct completion s_kobj_unregister; + struct super_block *s_sb; + struct buffer_head *s_mmp_bh; + + /* Journaling */ + struct journal_s *s_journal; + unsigned long s_ext4_flags; /* Ext4 superblock flags */ + struct mutex s_orphan_lock; /* Protects on disk list changes */ + struct list_head s_orphan; /* List of orphaned inodes in on disk + list */ + struct ext4_orphan_info s_orphan_info; + unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; + struct file *s_journal_bdev_file; +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char __rcu *s_qf_names[EXT4_MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + struct ext4_system_blocks __rcu *s_system_blks; + +#ifdef EXTENTS_STATS + /* ext4 extents stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif + + /* for buddy allocator */ + struct ext4_group_info ** __rcu *s_group_info; + struct inode *s_buddy_cache; + spinlock_t s_md_lock; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; + unsigned int s_group_info_size; + atomic_t s_mb_free_pending; + struct list_head s_freed_data_list[2]; /* List of blocks to be freed + after commit completed */ + struct list_head s_discard_list; + struct work_struct s_discard_work; + atomic_t s_retry_alloc_pending; + struct xarray *s_mb_avg_fragment_size; + struct xarray *s_mb_largest_free_orders; + + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_max_linear_groups; + unsigned int s_mb_stream_request; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; + unsigned int s_max_dir_size_kb; + unsigned int s_mb_prefetch; + unsigned int s_mb_prefetch_limit; + unsigned int s_mb_best_avail_max_trim_order; + unsigned int s_sb_update_sec; + unsigned int s_sb_update_kb; + + /* where last allocation was done - for stream allocation */ + ext4_group_t *s_mb_last_groups; + unsigned int s_mb_nr_global_goals; + + /* stats for buddy allocator */ + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */ + atomic_t s_bal_groups_scanned; /* number of groups scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_stream_goals; /* stream allocation global goal hits */ + atomic_t s_bal_len_goals; /* len goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS]; + atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS]; + atomic64_t 
s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */ + atomic_t s_mb_buddies_generated; /* number of buddies generated */ + atomic64_t s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; + atomic_t s_lock_busy; + + /* locality groups */ + struct ext4_locality_group __percpu *s_locality_groups; + + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; + + /* the size of zero-out chunk */ + unsigned int s_extent_max_zeroout_kb; + + unsigned int s_log_groups_per_flex; + struct flex_groups * __rcu *s_flex_groups; + ext4_group_t s_flex_groups_allocated; + + /* workqueue for reserved extent conversions (buffered io) */ + struct workqueue_struct *rsv_conversion_wq; + + /* timer for periodic error stats printing */ + struct timer_list s_err_report; + + /* Lazy inode table initialization info */ + struct ext4_li_request *s_li_request; + /* Wait multiplier for lazy initialization thread */ + unsigned int s_li_wait_mult; + + /* Kernel thread for multiple mount protection */ + struct task_struct *s_mmp_tsk; + + /* record the last minlen when FITRIM is called. */ + unsigned long s_last_trim_minblks; + + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_csum_seed; + + /* Reclaim extents from extent status tree */ + struct shrinker *s_es_shrinker; + struct list_head s_es_list; /* List of inodes with reclaimable extents */ + long s_es_nr_inode; + struct ext4_es_stats s_es_stats; + struct mb_cache *s_ea_block_cache; + struct mb_cache *s_ea_inode_cache; + spinlock_t s_es_lock ____cacheline_aligned_in_smp; + + /* Journal triggers for checksum computation */ + struct ext4_journal_trigger s_journal_triggers[EXT4_JOURNAL_TRIGGER_COUNT]; + + /* Ratelimit ext4 messages. */ + struct ratelimit_state s_err_ratelimit_state; + struct ratelimit_state s_warning_ratelimit_state; + struct ratelimit_state s_msg_ratelimit_state; + atomic_t s_warning_count; + atomic_t s_msg_count; + + /* Encryption policy for '-o test_dummy_encryption' */ + struct fscrypt_dummy_policy s_dummy_enc_policy; + + /* + * Barrier between writepages ops and changing any inode's JOURNAL_DATA + * or EXTENTS flag or between writepages ops and changing DELALLOC or + * DIOREAD_NOLOCK mount options on remount. + */ + struct percpu_rw_semaphore s_writepages_rwsem; + struct dax_device *s_daxdev; + u64 s_dax_part_off; +#ifdef CONFIG_EXT4_DEBUG + unsigned long s_simulate_fail; +#endif + /* Record the errseq of the backing block device */ + errseq_t s_bdev_wb_err; + spinlock_t s_bdev_wb_lock; + + /* Information about errors that happened during this mount */ + spinlock_t s_error_lock; + int s_add_error_count; + int s_first_error_code; + __u32 s_first_error_line; + __u32 s_first_error_ino; + __u64 s_first_error_block; + const char *s_first_error_func; + time64_t s_first_error_time; + int s_last_error_code; + __u32 s_last_error_line; + __u32 s_last_error_ino; + __u64 s_last_error_block; + const char *s_last_error_func; + time64_t s_last_error_time; + /* + * If we are in a context where we cannot update the on-disk + * superblock, we queue the work here. This is used to update + * the error information in the superblock, and for periodic + * updates of the superblock called from the commit callback + * function. 
+ */ + struct work_struct s_sb_upd_work; + + /* Atomic write unit values in bytes */ + unsigned int s_awu_min; + unsigned int s_awu_max; + + /* Ext4 fast commit sub transaction ID */ + atomic_t s_fc_subtid; + + /* + * After commit starts, the main queue gets locked, and the further + * updates get added in the staging queue. + */ +#define FC_Q_MAIN 0 +#define FC_Q_STAGING 1 + struct list_head s_fc_q[2]; /* Inodes staged for fast commit + * that have data changes in them. + */ + struct list_head s_fc_dentry_q[2]; /* directory entry updates */ + unsigned int s_fc_bytes; + /* + * Main fast commit lock. This lock protects accesses to the + * following fields: + * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh. + */ + struct mutex s_fc_lock; + struct buffer_head *s_fc_bh; + struct ext4_fc_stats s_fc_stats; + tid_t s_fc_ineligible_tid; +#ifdef CONFIG_EXT4_DEBUG + int s_fc_debug_max_replay; +#endif + struct ext4_fc_replay_state s_fc_replay_state; +}; + +static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext4_inode_info *EXT4_I(struct inode *inode) +{ + return container_of(inode, struct ext4_inode_info, vfs_inode); +} + +static inline int ext4_writepages_down_read(struct super_block *sb) +{ + percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); + return memalloc_nofs_save(); +} + +static inline void ext4_writepages_up_read(struct super_block *sb, int ctx) +{ + memalloc_nofs_restore(ctx); + percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); +} + +static inline int ext4_writepages_down_write(struct super_block *sb) +{ + percpu_down_write(&EXT4_SB(sb)->s_writepages_rwsem); + return memalloc_nofs_save(); +} + +static inline void ext4_writepages_up_write(struct super_block *sb, int ctx) +{ + memalloc_nofs_restore(ctx); + percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem); +} + +static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT4_ROOT_INO || + (ino >= EXT4_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); +} + +static inline int ext4_get_resuid(struct ext4_super_block *es) +{ + return le16_to_cpu(es->s_def_resuid) | + le16_to_cpu(es->s_def_resuid_hi) << 16; +} + +static inline int ext4_get_resgid(struct ext4_super_block *es) +{ + return le16_to_cpu(es->s_def_resgid) | + le16_to_cpu(es->s_def_resgid_hi) << 16; +} + +/* + * Returns: sbi->field[index] + * Used to access an array element from the following sbi fields which require + * rcu protection to avoid dereferencing an invalid pointer due to reassignment + * - s_group_desc + * - s_group_info + * - s_flex_group + */ +#define sbi_array_rcu_deref(sbi, field, index) \ +({ \ + typeof(*((sbi)->field)) _v; \ + rcu_read_lock(); \ + _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index]; \ + rcu_read_unlock(); \ + _v; \ +}) + +/* + * run-time mount flags + */ +enum { + EXT4_MF_MNTDIR_SAMPLED, + EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */ + EXT4_MF_JOURNAL_DESTROY /* Journal is in process of destroying */ +}; + +static inline void ext4_set_mount_flag(struct super_block *sb, int bit) +{ + set_bit(bit, &EXT4_SB(sb)->s_mount_flags); +} + +static inline void ext4_clear_mount_flag(struct super_block *sb, int bit) +{ + clear_bit(bit, &EXT4_SB(sb)->s_mount_flags); +} + +static inline int ext4_test_mount_flag(struct super_block *sb, int bit) +{ + return test_bit(bit, &EXT4_SB(sb)->s_mount_flags); +} + + +/* + * Simulate_fail codes + */ +#define EXT4_SIM_BBITMAP_EIO 1 +#define EXT4_SIM_BBITMAP_CRC 2 +#define 
EXT4_SIM_IBITMAP_EIO 3 +#define EXT4_SIM_IBITMAP_CRC 4 +#define EXT4_SIM_INODE_EIO 5 +#define EXT4_SIM_INODE_CRC 6 +#define EXT4_SIM_DIRBLOCK_EIO 7 +#define EXT4_SIM_DIRBLOCK_CRC 8 + +static inline bool ext4_simulate_fail(struct super_block *sb, + unsigned long code) +{ +#ifdef CONFIG_EXT4_DEBUG + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (unlikely(sbi->s_simulate_fail == code)) { + sbi->s_simulate_fail = 0; + return true; + } +#endif + return false; +} + +/* + * Error number codes for s_{first,last}_error_errno + * + * Linux errno numbers are architecture specific, so we need to translate + * them into something which is architecture independent. We don't define + * codes for all errno's; just the ones which are most likely to be the cause + * of an ext4_error() call. + */ +#define EXT4_ERR_UNKNOWN 1 +#define EXT4_ERR_EIO 2 +#define EXT4_ERR_ENOMEM 3 +#define EXT4_ERR_EFSBADCRC 4 +#define EXT4_ERR_EFSCORRUPTED 5 +#define EXT4_ERR_ENOSPC 6 +#define EXT4_ERR_ENOKEY 7 +#define EXT4_ERR_EROFS 8 +#define EXT4_ERR_EFBIG 9 +#define EXT4_ERR_EEXIST 10 +#define EXT4_ERR_ERANGE 11 +#define EXT4_ERR_EOVERFLOW 12 +#define EXT4_ERR_EBUSY 13 +#define EXT4_ERR_ENOTDIR 14 +#define EXT4_ERR_ENOTEMPTY 15 +#define EXT4_ERR_ESHUTDOWN 16 +#define EXT4_ERR_EFAULT 17 + +/* + * Inode dynamic state flags + */ +enum { + EXT4_STATE_NEW, /* inode is newly created */ + EXT4_STATE_XATTR, /* has in-inode xattrs */ + EXT4_STATE_NO_EXPAND, /* No space for expansion */ + EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ + EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ + EXT4_STATE_NEWENTRY, /* File just added to dir */ + EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ + EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ + EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ + EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ + EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */ + EXT4_STATE_FC_FLUSHING_DATA, /* Fast commit flushing data */ + EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */ +}; + +#define EXT4_INODE_BIT_FNS(name, field, offset) \ +static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ +{ \ + return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ +{ \ + set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ +{ \ + clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_flag(struct inode *inode, int bit); +static inline void ext4_set_inode_flag(struct inode *inode, int bit); +static inline void ext4_clear_inode_flag(struct inode *inode, int bit); +EXT4_INODE_BIT_FNS(flag, flags, 0) + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. 
*/ +static inline int ext4_test_inode_state(struct inode *inode, int bit); +static inline void ext4_set_inode_state(struct inode *inode, int bit); +static inline void ext4_clear_inode_state(struct inode *inode, int bit); +#if (BITS_PER_LONG < 64) +EXT4_INODE_BIT_FNS(state, state_flags, 0) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + (ei)->i_state_flags = 0; +} +#else +EXT4_INODE_BIT_FNS(state, flags, 32) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + /* We depend on the fact that callers will set i_flags */ +} +#endif +#else +/* Assume that user mode programs are passing in an ext4fs superblock, not + * a kernel struct super_block. This will allow us to call the feature-test + * macros from user land. */ +#define EXT4_SB(sb) (sb) +#endif + +static inline bool ext4_verity_in_progress(struct inode *inode) +{ + return IS_ENABLED(CONFIG_FS_VERITY) && + ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); +} + +#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + +/* + * Check whether the inode is tracked as orphan (either in orphan file or + * orphan list). + */ +static inline bool ext4_inode_orphan_tracked(struct inode *inode) +{ + return ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) || + !list_empty(&EXT4_I(inode)->i_orphan); +} + +/* + * Codes for operating systems + */ +#define EXT4_OS_LINUX 0 +#define EXT4_OS_HURD 1 +#define EXT4_OS_MASIX 2 +#define EXT4_OS_FREEBSD 3 +#define EXT4_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV + +#define EXT4_GOOD_OLD_INODE_SIZE 128 + +#define EXT4_EXTRA_TIMESTAMP_MAX (((s64)1 << 34) - 1 + S32_MIN) +#define EXT4_NON_EXTRA_TIMESTAMP_MAX S32_MAX +#define EXT4_TIMESTAMP_MIN S32_MIN + +/* + * Feature set definitions + */ + +#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 +#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 +/* + * The reason why "FAST_COMMIT" is a compat feature is that, FS becomes + * incompatible only if fast commit blocks are present in the FS. Since we + * clear the journal (and thus the fast commit blocks), we don't mark FS as + * incompatible. We also have a JBD2 incompat feature, which gets set when + * there are fast commit blocks present in the journal. + */ +#define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400 +#define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800 +#define EXT4_FEATURE_COMPAT_ORPHAN_FILE 0x1000 /* Orphan file exists */ + +#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 +#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 +#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 +#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 +#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 +#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +/* + * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When + * METADATA_CSUM is set, group descriptor checksums use the same algorithm as + * all other data structures' checksums. However, the METADATA_CSUM and + * GDT_CSUM bits are mutually exclusive. 
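 * As elsewhere in this file, these checksums are crc32c values (see
 * ext4_chksum() below), seeded from the precomputed filesystem-UUID
 * checksum, e.g. csum = ext4_chksum(sbi->s_csum_seed, data, len).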
+ */ +#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 +#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 +#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 +#define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000 +#define EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT 0x10000 /* Orphan file may be + non-empty */ + +#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 +#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ +#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 +#define EXT4_FEATURE_INCOMPAT_MMP 0x0100 +#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 +#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ +#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ +#define EXT4_FEATURE_INCOMPAT_CSUM_SEED 0x2000 +#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ +#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 +#define EXT4_FEATURE_INCOMPAT_CASEFOLD 0x20000 + +extern void ext4_update_dynamic_rev(struct super_block *sb); + +#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_compat & \ + cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + ext4_update_dynamic_rev(sb); \ + EXT4_SB(sb)->s_es->s_feature_compat |= \ + cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_compat &= \ + ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ +} + +#define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + ext4_update_dynamic_rev(sb); \ + EXT4_SB(sb)->s_es->s_feature_ro_compat |= \ + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_ro_compat &= \ + ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ +} + +#define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ + cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + ext4_update_dynamic_rev(sb); \ + EXT4_SB(sb)->s_es->s_feature_incompat |= \ + cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_incompat &= \ + ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ +} + +EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc, DIR_PREALLOC) +EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes, IMAGIC_INODES) +EXT4_FEATURE_COMPAT_FUNCS(journal, HAS_JOURNAL) +EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR) +EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE) +EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) +EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2) 
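/*
 * A sketch of what the generator macros above produce: the invocation
 * EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) expands to
 *
 *	static inline bool ext4_has_feature_dir_index(struct super_block *sb);
 *	static inline void ext4_set_feature_dir_index(struct super_block *sb);
 *	static inline void ext4_clear_feature_dir_index(struct super_block *sb);
 *
 * (the setter also bumps the revision via ext4_update_dynamic_rev()), so
 * callers elsewhere in this file simply write, e.g.:
 *
 *	if (ext4_has_feature_dir_index(dir->i_sb))
 *		...
 */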
+EXT4_FEATURE_COMPAT_FUNCS(fast_commit, FAST_COMMIT) +EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES) +EXT4_FEATURE_COMPAT_FUNCS(orphan_file, ORPHAN_FILE) + +EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER) +EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE) +EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir, BTREE_DIR) +EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file, HUGE_FILE) +EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum, GDT_CSUM) +EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink, DIR_NLINK) +EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize, EXTRA_ISIZE) +EXT4_FEATURE_RO_COMPAT_FUNCS(quota, QUOTA) +EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC) +EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM) +EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) +EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) +EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY) +EXT4_FEATURE_RO_COMPAT_FUNCS(orphan_present, ORPHAN_PRESENT) + +EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) +EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) +EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery, RECOVER) +EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev, JOURNAL_DEV) +EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg, META_BG) +EXT4_FEATURE_INCOMPAT_FUNCS(extents, EXTENTS) +EXT4_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) +EXT4_FEATURE_INCOMPAT_FUNCS(mmp, MMP) +EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg, FLEX_BG) +EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode, EA_INODE) +EXT4_FEATURE_INCOMPAT_FUNCS(dirdata, DIRDATA) +EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED) +EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR) +EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA) +EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) +EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) + +#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT4_FEATURE_COMPAT_SUPP (EXT4_FEATURE_COMPAT_EXT_ATTR| \ + EXT4_FEATURE_COMPAT_ORPHAN_FILE) +#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CASEFOLD | \ + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ + EXT4_FEATURE_INCOMPAT_LARGEDIR) +#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ + EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ + EXT4_FEATURE_RO_COMPAT_QUOTA |\ + EXT4_FEATURE_RO_COMPAT_PROJECT |\ + EXT4_FEATURE_RO_COMPAT_VERITY |\ + EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT) + +#define EXTN_FEATURE_FUNCS(ver) \ +static inline 
bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_compat & \ + cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \ +} \ +static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ + cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \ +} \ +static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ + cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \ +} + +EXTN_FEATURE_FUNCS(2) +EXTN_FEATURE_FUNCS(3) +EXTN_FEATURE_FUNCS(4) + +static inline bool ext4_has_compat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_compat != 0); +} +static inline bool ext4_has_ro_compat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0); +} +static inline bool ext4_has_incompat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_incompat != 0); +} + +extern int ext4_feature_set_ok(struct super_block *sb, int readonly); + +/* + * Superblock flags + */ +enum { + EXT4_FLAGS_RESIZING, /* Avoid superblock update and resize race */ + EXT4_FLAGS_SHUTDOWN, /* Prevent access to the file system */ + EXT4_FLAGS_BDEV_IS_DAX, /* Current block device support DAX */ + EXT4_FLAGS_EMERGENCY_RO,/* Emergency read-only due to fs errors */ +}; + +static inline int ext4_forced_shutdown(struct super_block *sb) +{ + return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); +} + +static inline int ext4_emergency_ro(struct super_block *sb) +{ + return test_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags); +} + +static inline int ext4_emergency_state(struct super_block *sb) +{ + if (unlikely(ext4_forced_shutdown(sb))) + return -EIO; + if (unlikely(ext4_emergency_ro(sb))) + return -EROFS; + return 0; +} + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT4_DEF_RESUID 0 +#define EXT4_DEF_RESGID 0 + +/* + * Default project ID + */ +#define EXT4_DEF_PROJID 0 + +#define EXT4_DEF_INODE_READAHEAD_BLKS 32 + +/* + * Default mount options + */ +#define EXT4_DEFM_DEBUG 0x0001 +#define EXT4_DEFM_BSDGROUPS 0x0002 +#define EXT4_DEFM_XATTR_USER 0x0004 +#define EXT4_DEFM_ACL 0x0008 +#define EXT4_DEFM_UID16 0x0010 +#define EXT4_DEFM_JMODE 0x0060 +#define EXT4_DEFM_JMODE_DATA 0x0020 +#define EXT4_DEFM_JMODE_ORDERED 0x0040 +#define EXT4_DEFM_JMODE_WBACK 0x0060 +#define EXT4_DEFM_NOBARRIER 0x0100 +#define EXT4_DEFM_BLOCK_VALIDITY 0x0200 +#define EXT4_DEFM_DISCARD 0x0400 +#define EXT4_DEFM_NODELALLOC 0x0800 + +/* + * Default journal batch times and ioprio. 
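 * EXT4_DEF_JOURNAL_IOPRIO below selects the best-effort I/O scheduling
 * class at priority level 3, within the 0-7 best-effort range.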
+ */ +#define EXT4_DEF_MIN_BATCH_TIME 0 +#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ +#define EXT4_DEF_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) + + +/* + * Default values for superblock update + */ +#define EXT4_DEF_SB_UPDATE_INTERVAL_SEC (3600) /* seconds (1 hour) */ +#define EXT4_DEF_SB_UPDATE_INTERVAL_KB (16384) /* kilobytes (16MB) */ + + +/* + * Minimum number of groups in a flexgroup before we separate out + * directories into the first block group of a flexgroup + */ +#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 + +/* + * Structure of a directory entry + */ +#define EXT4_NAME_LEN 255 +/* + * Base length of the ext4 directory entry excluding the name length + */ +#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN) + +struct ext4_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[EXT4_NAME_LEN]; /* File name */ +}; + + +/* + * Encrypted Casefolded entries require saving the hash on disk. This structure + * followed ext4_dir_entry_2's name[name_len] at the next 4 byte aligned + * boundary. + */ +struct ext4_dir_entry_hash { + __le32 hash; + __le32 minor_hash; +}; + +/* + * The new version of the directory entry. Since EXT4 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext4_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; /* See file type macros EXT4_FT_* below */ + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * Access the hashes at the end of ext4_dir_entry_2 + */ +#define EXT4_DIRENT_HASHES(entry) \ + ((struct ext4_dir_entry_hash *) \ + (((void *)(entry)) + \ + ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND))) +#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(entry)->hash) +#define EXT4_DIRENT_MINOR_HASH(entry) \ + le32_to_cpu(EXT4_DIRENT_HASHES(entry)->minor_hash) + +static inline bool ext4_hash_in_dirent(const struct inode *inode) +{ + return IS_CASEFOLDED(inode) && IS_ENCRYPTED(inode); +} + +/* + * This is a bogus directory entry at the end of each leaf block that + * records checksums. + */ +struct ext4_dir_entry_tail { + __le32 det_reserved_zero1; /* Pretend to be unused */ + __le16 det_rec_len; /* 12 */ + __u8 det_reserved_zero2; /* Zero name length */ + __u8 det_reserved_ft; /* 0xDE, fake file type */ + __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ +}; + +#define EXT4_DIRENT_TAIL(block, blocksize) \ + ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ + ((blocksize) - \ + sizeof(struct ext4_dir_entry_tail)))) + +/* + * Ext4 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +#define EXT4_FT_UNKNOWN 0 +#define EXT4_FT_REG_FILE 1 +#define EXT4_FT_DIR 2 +#define EXT4_FT_CHRDEV 3 +#define EXT4_FT_BLKDEV 4 +#define EXT4_FT_FIFO 5 +#define EXT4_FT_SOCK 6 +#define EXT4_FT_SYMLINK 7 + +#define EXT4_FT_MAX 8 + +#define EXT4_FT_DIR_CSUM 0xDE + +/* + * EXT4_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT4_DIR_PAD 4 +#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +#define EXT4_MAX_REC_LEN ((1<<16)-1) + +/* + * The rec_len is dependent on the type of directory. 
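 * For an ordinary directory an entry needs name_len plus the 8-byte fixed
 * header (inode, rec_len, name_len, file_type), rounded up to a 4-byte
 * boundary, so e.g. a 5-character name occupies (5 + 8 + 3) & ~3 = 16 bytes.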
Directories that are + * casefolded and encrypted need to store the hash as well, so we add room for + * ext4_extended_dir_entry_2. For all entries related to '.' or '..' you should + * pass NULL for dir, as those entries do not use the extra fields. + */ +static inline unsigned int ext4_dir_rec_len(__u8 name_len, + const struct inode *dir) +{ + int rec_len = (name_len + 8 + EXT4_DIR_ROUND); + + if (dir && ext4_hash_in_dirent(dir)) + rec_len += sizeof(struct ext4_dir_entry_hash); + return (rec_len & ~EXT4_DIR_ROUND); +} + +/* + * If we ever get support for fs block sizes > page_size, we'll need + * to remove the #if statements in the next two functions... + */ +static inline unsigned int +ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_SIZE >= 65536) + if (len == EXT4_MAX_REC_LEN || len == 0) + return blocksize; + return (len & 65532) | ((len & 3) << 16); +#else + return len; +#endif +} + +static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) +{ + BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)); +#if (PAGE_SIZE >= 65536) + if (len < 65536) + return cpu_to_le16(len); + if (len == blocksize) { + if (blocksize == 65536) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else + return cpu_to_le16(0); + } + return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); +#else + return cpu_to_le16(len); +#endif +} + +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \ + ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) +#define EXT4_DIR_LINK_MAX(dir) unlikely((dir)->i_nlink >= EXT4_LINK_MAX && \ + !(ext4_has_feature_dir_nlink((dir)->i_sb) && is_dx(dir))) +#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 +#define DX_HASH_SIPHASH 6 +#define DX_HASH_LAST DX_HASH_SIPHASH + +static inline u32 ext4_chksum(u32 crc, const void *address, unsigned int length) +{ + return crc32c(crc, address, length); +} + +#ifdef __KERNEL__ + +/* hash info structure used by the directory hash */ +struct dx_hash_info +{ + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + + +/* 32 and 64 bit signed EOF for dx directories */ +#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) +#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) + + +/* + * Control parameters used by ext4_htree_next_block + */ +#define HASH_NB_ALWAYS 1 + +struct ext4_filename { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; + struct dx_hash_info hinfo; +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_str crypto_buf; +#endif +#if IS_ENABLED(CONFIG_UNICODE) + struct qstr cf_name; +#endif +}; + +#define fname_name(p) ((p)->disk_name.name) +#define fname_usr_name(p) ((p)->usr_fname->name) +#define fname_len(p) ((p)->disk_name.len) + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext4_iloc +{ + struct buffer_head *bh; + unsigned long offset; + ext4_group_t block_group; +}; + +static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) +{ + return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); +} + +static inline bool ext4_is_quota_file(struct inode *inode) +{ + return IS_NOQUOTA(inode) && + !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL); +} + +/* + * 
This structure is stuffed into the struct file's private_data field + * for directories. It is where we put information so that we can do + * readdir operations in hash tree order. + */ +struct dir_private_info { + struct rb_root root; + struct rb_node *curr_node; + struct fname *extra_fname; + loff_t last_pos; + __u32 curr_hash; + __u32 curr_minor_hash; + __u32 next_hash; + u64 cookie; + bool initialized; +}; + +/* calculate the first block number of the group */ +static inline ext4_fsblk_t +ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) +{ + return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); +} + +/* + * Special error return code only used by dx_probe() and its callers. + */ +#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) + +/* htree levels for ext4 */ +#define EXT4_HTREE_LEVEL_COMPAT 2 +#define EXT4_HTREE_LEVEL 3 + +static inline int ext4_dir_htree_level(struct super_block *sb) +{ + return ext4_has_feature_largedir(sb) ? + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; +} + +/* + * Timeout and state flag for lazy initialization inode thread. + */ +#define EXT4_DEF_LI_WAIT_MULT 10 +#define EXT4_DEF_LI_MAX_START_DELAY 5 +#define EXT4_LAZYINIT_QUIT 0x0001 +#define EXT4_LAZYINIT_RUNNING 0x0002 + +/* + * Lazy inode table initialization info + */ +struct ext4_lazy_init { + unsigned long li_state; + struct list_head li_request_list; + struct mutex li_list_mtx; +}; + +enum ext4_li_mode { + EXT4_LI_MODE_PREFETCH_BBITMAP, + EXT4_LI_MODE_ITABLE, +}; + +struct ext4_li_request { + struct super_block *lr_super; + enum ext4_li_mode lr_mode; + ext4_group_t lr_first_not_zeroed; + ext4_group_t lr_next_group; + struct list_head lr_request; + unsigned long lr_next_sched; + unsigned long lr_timeout; +}; + +struct ext4_features { + struct kobject f_kobj; + struct completion f_kobj_unregister; +}; + +/* + * This structure will be used for multiple mount protection. It will be + * written into the block number saved in the s_mmp_block field in the + * superblock. Programs that check MMP should assume that if + * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe + * to use the filesystem, regardless of how old the timestamp is. + */ +#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ +#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ +#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ +#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ + +struct mmp_struct { + __le32 mmp_magic; /* Magic number for MMP */ + __le32 mmp_seq; /* Sequence no. updated periodically */ + + /* + * mmp_time, mmp_nodename & mmp_bdevname are only used for information + * purposes and do not affect the correctness of the algorithm + */ + __le64 mmp_time; /* Time last updated */ + char mmp_nodename[64]; /* Node which last updated MMP block */ + char mmp_bdevname[32]; /* Bdev which last updated MMP block */ + + /* + * mmp_check_interval is used to verify if the MMP block has been + * updated on the block device. The value is updated based on the + * maximum time to write the MMP block during an update cycle. 
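 * The effective interval is expected to stay within
 * EXT4_MMP_MIN_CHECK_INTERVAL and EXT4_MMP_MAX_CHECK_INTERVAL (5 and 300
 * seconds, defined below).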
+ */ + __le16 mmp_check_interval; + + __le16 mmp_pad1; + __le32 mmp_pad2[226]; + __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */ +}; + +/* arguments passed to the mmp thread */ +struct mmpd_data { + struct buffer_head *bh; /* bh from initial read_mmp_block() */ + struct super_block *sb; /* super block of the fs */ +}; + +/* + * Check interval multiplier + * The MMP block is written every update interval and initially checked every + * update interval x the multiplier (the value is then adapted based on the + * write latency). The reason is that writes can be delayed under load and we + * don't want readers to incorrectly assume that the filesystem is no longer + * in use. + */ +#define EXT4_MMP_CHECK_MULT 2UL + +/* + * Minimum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL + +/* + * Maximum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in <linux/kernel.h> but none of the + * ext4 source programs needs to include it so they are duplicated here. + */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* bitmap.c */ +extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); +void ext4_inode_bitmap_csum_set(struct super_block *sb, + struct ext4_group_desc *gdp, + struct buffer_head *bh); +int ext4_inode_bitmap_csum_verify(struct super_block *sb, + struct ext4_group_desc *gdp, + struct buffer_head *bh); +void ext4_block_bitmap_csum_set(struct super_block *sb, + struct ext4_group_desc *gdp, + struct buffer_head *bh); +int ext4_block_bitmap_csum_verify(struct super_block *sb, + struct ext4_group_desc *gdp, + struct buffer_head *bh); + +/* balloc.c */ +extern void ext4_get_group_no_and_offset(struct super_block *sb, + ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, + ext4_grpblk_t *offsetp); +extern ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block); + +extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); +extern unsigned long ext4_bg_num_gdb(struct super_block *sb, + ext4_group_t group); +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, + unsigned int flags, + unsigned long *count, + int *errp); +extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags); +extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); +extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, + ext4_group_t block_group, + struct buffer_head ** bh); +extern struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group); +extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); + +extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, + ext4_group_t block_group, + bool ignore_locked); +extern int ext4_wait_block_bitmap(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh); +extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); +ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); + +#if IS_ENABLED(CONFIG_UNICODE) +extern int ext4_fname_setup_ci_filename(struct inode *dir, + const struct qstr *iname, + struct ext4_filename *fname); + +static inline void 
ext4_fname_free_ci_filename(struct ext4_filename *fname) +{ + kfree(fname->cf_name.name); + fname->cf_name.name = NULL; +} +#else +static inline int ext4_fname_setup_ci_filename(struct inode *dir, + const struct qstr *iname, + struct ext4_filename *fname) +{ + return 0; +} + +static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname) +{ +} +#endif + +/* ext4 encryption related stuff goes here crypto.c */ +#ifdef CONFIG_FS_ENCRYPTION +extern const struct fscrypt_operations ext4_cryptops; + +int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct ext4_filename *fname); + +int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, + struct ext4_filename *fname); + +void ext4_fname_free_filename(struct ext4_filename *fname); + +int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg); + +#else /* !CONFIG_FS_ENCRYPTION */ +static inline int ext4_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, + struct ext4_filename *fname) +{ + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *) iname->name; + fname->disk_name.len = iname->len; + + return ext4_fname_setup_ci_filename(dir, iname, fname); +} + +static inline int ext4_fname_prepare_lookup(struct inode *dir, + struct dentry *dentry, + struct ext4_filename *fname) +{ + return ext4_fname_setup_filename(dir, &dentry->d_name, 1, fname); +} + +static inline void ext4_fname_free_filename(struct ext4_filename *fname) +{ + ext4_fname_free_ci_filename(fname); +} + +static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp, + void __user *arg) +{ + return -EOPNOTSUPP; +} +#endif /* !CONFIG_FS_ENCRYPTION */ + +/* dir.c */ +extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, + struct file *, + struct ext4_dir_entry_2 *, + struct buffer_head *, char *, int, + unsigned int); +#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ + unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ + (de), (bh), (buf), (size), (offset))) +extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent, + struct fscrypt_str *ent_name); +extern void ext4_htree_free_dir_info(struct dir_private_info *p); +extern int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh, + void *buf, int buf_size, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **dest_de); +void ext4_insert_dentry(struct inode *dir, struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname); +static inline void ext4_update_dx_flag(struct inode *inode) +{ + if (!ext4_has_feature_dir_index(inode->i_sb) && + ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { + /* ext4_iget() should have caught this... 
*/ + WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb)); + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); + } +} +static const unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static inline unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) + return DT_UNKNOWN; + + return ext4_filetype_table[filetype]; +} +extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, + void *buf, int buf_size); + +/* fsync.c */ +extern int ext4_sync_file(struct file *, loff_t, loff_t, int); + +/* hash.c */ +extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len, + struct dx_hash_info *hinfo); + +/* ialloc.c */ +extern int ext4_mark_inode_used(struct super_block *sb, int ino); +extern struct inode *__ext4_new_inode(struct mnt_idmap *, handle_t *, + struct inode *, umode_t, + const struct qstr *qstr, __u32 goal, + uid_t *owner, __u32 i_flags, + int handle_type, unsigned int line_no, + int nblocks); + +#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ + __ext4_new_inode(&nop_mnt_idmap, (handle), (dir), (mode), (qstr), \ + (goal), (owner), i_flags, 0, 0, 0) +#define ext4_new_inode_start_handle(idmap, dir, mode, qstr, goal, owner, \ + type, nblocks) \ + __ext4_new_inode((idmap), NULL, (dir), (mode), (qstr), (goal), (owner), \ + 0, (type), __LINE__, (nblocks)) + + +extern void ext4_free_inode(handle_t *, struct inode *); +extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); +extern unsigned long ext4_count_free_inodes(struct super_block *); +extern unsigned long ext4_count_dirs(struct super_block *); +extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +extern int ext4_init_inode_table(struct super_block *sb, + ext4_group_t group, int barrier); +extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); + +/* fast_commit.c */ +int ext4_fc_info_show(struct seq_file *seq, void *v); +void ext4_fc_init(struct super_block *sb, journal_t *journal); +void ext4_fc_init_inode(struct inode *inode); +void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end); +void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode, + struct dentry *dentry); +void __ext4_fc_track_link(handle_t *handle, struct inode *inode, + struct dentry *dentry); +void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry); +void ext4_fc_track_link(handle_t *handle, struct dentry *dentry); +void __ext4_fc_track_create(handle_t *handle, struct inode *inode, + struct dentry *dentry); +void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); +void ext4_fc_track_inode(handle_t *handle, struct inode *inode); +void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); +void ext4_fc_del(struct inode *inode); +bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block); +void ext4_fc_replay_cleanup(struct super_block *sb); +int ext4_fc_commit(journal_t *journal, tid_t commit_tid); +int __init ext4_fc_init_dentry_cache(void); +void ext4_fc_destroy_dentry_cache(void); +int ext4_fc_record_regions(struct super_block *sb, int ino, + ext4_lblk_t lblk, ext4_fsblk_t pblk, + int len, int replay); + +/* mballoc.c */ +extern const struct seq_operations ext4_mb_seq_groups_ops; +extern const struct seq_operations ext4_mb_seq_structs_summary_ops; +extern int ext4_seq_mb_stats_show(struct seq_file 
*seq, void *offset); +extern int ext4_mb_init(struct super_block *); +extern void ext4_mb_release(struct super_block *); +extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, + struct ext4_allocation_request *, int *); +extern void ext4_discard_preallocations(struct inode *); +extern int __init ext4_init_mballoc(void); +extern void ext4_exit_mballoc(void); +extern ext4_group_t ext4_mb_prefetch(struct super_block *sb, + ext4_group_t group, + unsigned int nr, int *cnt); +extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, + unsigned int nr); + +extern void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags); +extern int ext4_mb_alloc_groupinfo(struct super_block *sb, + ext4_group_t ngroups); +extern int ext4_mb_add_groupinfo(struct super_block *sb, + ext4_group_t i, struct ext4_group_desc *desc); +extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); +extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); +extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); +extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, + int len, bool state); +static inline bool ext4_mb_cr_expensive(enum criteria cr) +{ + return cr >= CR_GOAL_LEN_SLOW; +} + +/* inode.c */ +void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei); +int ext4_inode_is_fast_symlink(struct inode *inode); +void ext4_check_map_extents_env(struct inode *inode); +struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); +struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); +int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, + bool wait, struct buffer_head **bhs); +int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create); +int ext4_walk_page_buffers(handle_t *handle, + struct inode *inode, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, struct inode *inode, + struct buffer_head *bh)); +int do_journal_get_write_access(handle_t *handle, struct inode *inode, + struct buffer_head *bh); +void ext4_set_inode_mapping_order(struct inode *inode); +#define FALL_BACK_TO_NONDELALLOC 1 +#define CONVERT_INLINE_DATA 2 + +typedef enum { + EXT4_IGET_NORMAL = 0, + EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ + EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */ + EXT4_IGET_BAD = 0x0004, /* Allow to iget a bad inode */ + EXT4_IGET_EA_INODE = 0x0008 /* Inode should contain an EA value */ +} ext4_iget_flags; + +extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + ext4_iget_flags flags, const char *function, + unsigned int line); + +#define ext4_iget(sb, ino, flags) \ + __ext4_iget((sb), (ino), (flags), __func__, __LINE__) + +extern int ext4_write_inode(struct inode *, struct writeback_control *); +extern int ext4_setattr(struct mnt_idmap *, struct dentry *, + struct iattr *); +extern u32 ext4_dio_alignment(struct inode *inode); +extern int ext4_getattr(struct mnt_idmap *, const struct path *, + struct kstat *, u32, unsigned int); +extern void 
ext4_evict_inode(struct inode *); +extern void ext4_clear_inode(struct inode *); +extern int ext4_file_getattr(struct mnt_idmap *, const struct path *, + struct kstat *, u32, unsigned int); +extern void ext4_dirty_inode(struct inode *, int); +extern int ext4_change_inode_journal_flag(struct inode *, int); +extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, + struct ext4_iloc *iloc); +extern int ext4_inode_attach_jinode(struct inode *inode); +extern int ext4_can_truncate(struct inode *inode); +extern int ext4_truncate(struct inode *); +extern int ext4_break_layouts(struct inode *); +extern int ext4_truncate_page_cache_block_range(struct inode *inode, + loff_t start, loff_t end); +extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); +extern void ext4_set_inode_flags(struct inode *, bool init); +extern int ext4_alloc_da_blocks(struct inode *inode); +extern void ext4_set_aops(struct inode *inode); +extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks); +extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents); +extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, + loff_t lstart, loff_t lend); +extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); +extern qsize_t *ext4_get_reserved_space(struct inode *inode); +extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); +extern void ext4_da_release_space(struct inode *inode, int to_free); +extern void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim); +extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len); + +static inline bool is_special_ino(struct super_block *sb, unsigned long ino) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + return (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || + ino == le32_to_cpu(es->s_usr_quota_inum) || + ino == le32_to_cpu(es->s_grp_quota_inum) || + ino == le32_to_cpu(es->s_prj_quota_inum) || + ino == le32_to_cpu(es->s_orphan_file_inum); +} + +/* indirect.c */ +extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); +extern void ext4_ind_truncate(handle_t *, struct inode *inode); +extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, + ext4_lblk_t start, ext4_lblk_t end); + +/* ioctl.c */ +extern long ext4_ioctl(struct file *, unsigned int, unsigned long); +extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); +int ext4_fileattr_set(struct mnt_idmap *idmap, + struct dentry *dentry, struct file_kattr *fa); +int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa); +extern void ext4_reset_inode_seed(struct inode *inode); +int ext4_update_overhead(struct super_block *sb, bool force); +int ext4_force_shutdown(struct super_block *sb, u32 flags); + +/* migrate.c */ +extern int ext4_ext_migrate(struct inode *); +extern int ext4_ind_migrate(struct inode *inode); + +/* namei.c */ +extern int ext4_init_new_dir(handle_t *handle, struct inode *dir, + struct inode *inode); +extern int ext4_dirblock_csum_verify(struct inode *inode, + struct buffer_head *bh); +extern int ext4_htree_fill_tree(struct file *dir_file, __u32 
start_hash, + __u32 start_minor_hash, __u32 *next_hash); +extern int ext4_search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + struct ext4_filename *fname, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir); +extern int ext4_generic_delete_entry(struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size); +extern bool ext4_empty_dir(struct inode *inode); + +/* resize.c */ +extern void ext4_kvfree_array_rcu(void *to_free); +extern int ext4_group_add(struct super_block *sb, + struct ext4_new_group_data *input); +extern int ext4_group_extend(struct super_block *sb, + struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count); +extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); +extern unsigned int ext4_list_backups(struct super_block *sb, + unsigned int *three, unsigned int *five, + unsigned int *seven); + +/* super.c */ +extern struct buffer_head *ext4_sb_bread(struct super_block *sb, + sector_t block, blk_opf_t op_flags); +extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, + sector_t block); +extern struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb, + sector_t block); +extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, + bh_end_io_t *end_io, bool simu_fail); +extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, + bh_end_io_t *end_io, bool simu_fail); +extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait); +extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); +extern int ext4_seq_options_show(struct seq_file *seq, void *offset); +extern int ext4_calculate_overhead(struct super_block *sb); +extern __le32 ext4_superblock_csum(struct ext4_super_block *es); +extern void ext4_superblock_csum_set(struct super_block *sb); +extern int ext4_alloc_flex_bg_array(struct super_block *sb, + ext4_group_t ngroup); +extern const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]); +extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, + ext4_group_t block_group, + unsigned int flags); +extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb, + ext4_group_t block_group); + +extern __printf(7, 8) +void __ext4_error(struct super_block *, const char *, unsigned int, bool, + int, __u64, const char *, ...); +extern __printf(6, 7) +void __ext4_error_inode(struct inode *, const char *, unsigned int, + ext4_fsblk_t, int, const char *, ...); +extern __printf(5, 6) +void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern void __ext4_std_error(struct super_block *, const char *, + unsigned int, int); +extern __printf(4, 5) +void __ext4_warning(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(4, 5) +void __ext4_warning_inode(const struct inode *inode, const char *function, + unsigned int line, const char *fmt, ...); +extern __printf(3, 4) +void __ext4_msg(struct super_block *, const char *, const char *, ...); +extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, + const char *, unsigned int, const char *); +extern __printf(7, 8) +void __ext4_grp_locked_error(const char *, unsigned int, + struct super_block *, ext4_group_t, + unsigned long, ext4_fsblk_t, + const char *, ...); + +#define EXT4_ERROR_INODE(inode, fmt, a...) 
\ + ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) + +#define EXT4_ERROR_INODE_ERR(inode, err, fmt, a...) \ + __ext4_error_inode((inode), __func__, __LINE__, 0, (err), (fmt), ## a) + +#define ext4_error_inode_block(inode, block, err, fmt, a...) \ + __ext4_error_inode((inode), __func__, __LINE__, (block), (err), \ + (fmt), ## a) + +#define EXT4_ERROR_FILE(file, block, fmt, a...) \ + ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) + +#define ext4_abort(sb, err, fmt, a...) \ + __ext4_error((sb), __func__, __LINE__, true, (err), 0, (fmt), ## a) + +#ifdef CONFIG_PRINTK + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ + __ext4_error_inode(inode, func, line, block, 0, fmt, ##__VA_ARGS__) +#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \ + __ext4_error_inode((inode), (func), (line), (block), \ + (err), (fmt), ##__VA_ARGS__) +#define ext4_error_file(file, func, line, block, fmt, ...) \ + __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error(sb, fmt, ...) \ + __ext4_error((sb), __func__, __LINE__, false, 0, 0, (fmt), \ + ##__VA_ARGS__) +#define ext4_error_err(sb, err, fmt, ...) \ + __ext4_error((sb), __func__, __LINE__, false, (err), 0, (fmt), \ + ##__VA_ARGS__) +#define ext4_warning(sb, fmt, ...) \ + __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning_inode(inode, fmt, ...) \ + __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_msg(sb, level, fmt, ...) \ + __ext4_msg(sb, level, fmt, ##__VA_ARGS__) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ + __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ + fmt, ##__VA_ARGS__) + +#else + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_inode(inode, "", 0, block, 0, " "); \ +} while (0) +#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_inode(inode, "", 0, block, err, " "); \ +} while (0) +#define ext4_error_file(file, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_file(file, "", 0, block, " "); \ +} while (0) +#define ext4_error(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error(sb, "", 0, false, 0, 0, " "); \ +} while (0) +#define ext4_error_err(sb, err, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error(sb, "", 0, false, err, 0, " "); \ +} while (0) +#define ext4_warning(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning(sb, "", 0, " "); \ +} while (0) +#define ext4_warning_inode(inode, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning_inode(inode, "", 0, " "); \ +} while (0) +#define ext4_msg(sb, level, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_msg(sb, "", " "); \ +} while (0) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, "", 0, "") +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) 
\ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ +} while (0) + +#endif + +extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, + __u32 count); +extern void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed); + +static inline int ext4_has_group_desc_csum(struct super_block *sb) +{ + return ext4_has_feature_gdt_csum(sb) || + ext4_has_feature_metadata_csum(sb); +} + +#define ext4_read_incompat_64bit_val(es, name) \ + (((es)->s_feature_incompat & cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT) \ + ? 
(ext4_fsblk_t)le32_to_cpu(es->name##_hi) << 32 : 0) | \ + le32_to_cpu(es->name##_lo)) + +static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) +{ + return ext4_read_incompat_64bit_val(es, s_blocks_count); +} + +static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) +{ + return ext4_read_incompat_64bit_val(es, s_r_blocks_count); +} + +static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) +{ + return ext4_read_incompat_64bit_val(es, s_free_blocks_count); +} + +static inline void ext4_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline loff_t ext4_isize(struct super_block *sb, + struct ext4_inode *raw_inode) +{ + if (ext4_has_feature_largedir(sb) || + S_ISREG(le16_to_cpu(raw_inode->i_mode))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); + + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); +} + +static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +{ + raw_inode->i_size_lo = cpu_to_le32(i_size); + raw_inode->i_size_high = cpu_to_le32(i_size >> 32); +} + +/* + * Reading s_groups_count requires using smp_rmb() afterwards. See + * the locking protocol documented in the comments of ext4_group_add() + * in resize.c + */ +static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) +{ + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + + smp_rmb(); + return ngroups; +} + +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, + ext4_group_t block_group) +{ + return block_group >> sbi->s_log_groups_per_flex; +} + +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) +{ + return 1 << sbi->s_log_groups_per_flex; +} + +static inline loff_t ext4_get_maxbytes(struct inode *inode) +{ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return inode->i_sb->s_maxbytes; + return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; +} + +#define ext4_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext4_std_error((sb), __func__, __LINE__, (errno)); \ +} while (0) + +#ifdef CONFIG_SMP +/* Each CPU can accumulate percpu_counter_batch clusters in their local + * counters. So we need to make sure we have free clusters more + * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. + */ +#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) +#else +#define EXT4_FREECLUSTERS_WATERMARK 0 +#endif + +/* Update i_disksize. Requires i_rwsem to avoid races with truncate */ +static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) +{ + WARN_ON_ONCE(S_ISREG(inode->i_mode) && + !inode_is_locked(inode)); + down_write(&EXT4_I(inode)->i_data_sem); + if (newsize > EXT4_I(inode)->i_disksize) + WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize); + up_write(&EXT4_I(inode)->i_data_sem); +} + +/* Update i_size, i_disksize. 
Requires i_rwsem to avoid races with truncate */ +static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) +{ + int changed = 0; + + if (newsize > inode->i_size) { + i_size_write(inode, newsize); + changed = 1; + } + if (newsize > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, newsize); + changed |= 2; + } + return changed; +} + +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len); + +struct ext4_group_info { + unsigned long bb_state; +#ifdef AGGRESSIVE_CHECK + unsigned long bb_check_counter; +#endif + struct rb_root bb_free_root; + ext4_grpblk_t bb_first_free; /* first free block */ + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + int bb_avg_fragment_size_order; /* order of average + fragment in BG */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + ext4_group_t bb_group; /* Group number */ + struct list_head bb_prealloc_list; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif + struct rw_semaphore alloc_sem; + ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block + * regions, index is order. + * bb_counters[3] = 5 means + * 5 free 8-block regions. */ +}; + +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) +#define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4 + +#define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) + +#define EXT4_MB_GRP_WAS_TRIMMED(grp) \ + (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_SET_TRIMMED(grp) \ + (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ + (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \ + (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state))) + +#define EXT4_MAX_CONTENTION 8 +#define EXT4_CONTENTION_THRESHOLD 2 + +static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, + ext4_group_t group) +{ + return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); +} + +/* + * Returns true if the filesystem is busy enough that attempts to + * access the block group locks has run into contention. + */ +static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) +{ + return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); +} + +static inline bool ext4_try_lock_group(struct super_block *sb, ext4_group_t group) +{ + if (!spin_trylock(ext4_group_lock_ptr(sb, group))) + return false; + /* + * We're able to grab the lock right away, so drop the lock + * contention counter. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); + return true; +} + +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + if (!ext4_try_lock_group(sb, group)) { + /* + * The lock is busy, so bump the contention counter, + * and then wait on the spin lock. 
+ */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, + EXT4_MAX_CONTENTION); + spin_lock(ext4_group_lock_ptr(sb, group)); + } +} + +static inline void ext4_unlock_group(struct super_block *sb, + ext4_group_t group) +{ + spin_unlock(ext4_group_lock_ptr(sb, group)); +} + +#ifdef CONFIG_QUOTA +static inline bool ext4_quota_capable(struct super_block *sb) +{ + return (test_opt(sb, QUOTA) || ext4_has_feature_quota(sb)); +} + +static inline bool ext4_is_quota_journalled(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + return (ext4_has_feature_quota(sb) || + sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]); +} +int ext4_enable_quotas(struct super_block *sb); +#endif + +/* + * Block validity checking + */ +#define ext4_check_indirect_blockref(inode, bh) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ + EXT4_ADDR_PER_BLOCK((inode)->i_sb)) + +#define ext4_ind_check_inode(inode) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ + EXT4_NDIR_BLOCKS) + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext4_dir_operations; + +/* file.c */ +extern const struct inode_operations ext4_file_inode_operations; +extern const struct file_operations ext4_file_operations; +extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); + +/* inline.c */ +extern int ext4_get_max_inline_size(struct inode *inode); +extern int ext4_find_inline_data_nolock(struct inode *inode); +extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); +extern void ext4_update_final_de(void *de_buf, int old_size, int new_size); + +int ext4_readpage_inline(struct inode *inode, struct folio *folio); +extern int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + struct folio **foliop); +int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, + unsigned copied, struct folio *folio); +extern int ext4_generic_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + struct folio **foliop, + void **fsdata, bool da); +extern int ext4_try_add_inline_entry(handle_t *handle, + struct ext4_filename *fname, + struct inode *dir, struct inode *inode); +extern int ext4_try_create_inline_dir(handle_t *handle, + struct inode *parent, + struct inode *inode); +extern int ext4_read_inline_dir(struct file *filp, + struct dir_context *ctx, + int *has_inline_data); +extern int ext4_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data); +extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data); +extern int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data); +extern bool empty_inline_dir(struct inode *dir, int *has_inline_data); +extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval); +extern void *ext4_read_inline_link(struct inode *inode); + +struct iomap; +extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap); + +extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline); + +extern int ext4_convert_inline_data(struct inode *inode); + 
+static inline int ext4_has_inline_data(struct inode *inode) +{ + return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && + EXT4_I(inode)->i_inline_off; +} + +/* namei.c */ +extern const struct inode_operations ext4_dir_inode_operations; +extern const struct inode_operations ext4_special_inode_operations; +extern struct dentry *ext4_get_parent(struct dentry *child); +extern int ext4_init_dirblock(handle_t *handle, struct inode *inode, + struct buffer_head *dir_block, + unsigned int parent_ino, void *inline_buf, + int inline_size); +extern void ext4_initialize_dirent_tail(struct buffer_head *bh, + unsigned int blocksize); +extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, + struct buffer_head *bh); +extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name, + struct inode *inode, struct dentry *dentry); +extern int __ext4_link(struct inode *dir, struct inode *inode, + struct dentry *dentry); + +#define S_SHIFT 12 +static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { + [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, +}; + +static inline void ext4_set_de_type(struct super_block *sb, + struct ext4_dir_entry_2 *de, + umode_t mode) { + if (ext4_has_feature_filetype(sb)) + de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +/* readpages.c */ +extern int ext4_mpage_readpages(struct inode *inode, + struct readahead_control *rac, struct folio *folio); +extern int __init ext4_init_post_read_processing(void); +extern void ext4_exit_post_read_processing(void); + +/* symlink.c */ +extern const struct inode_operations ext4_encrypted_symlink_inode_operations; +extern const struct inode_operations ext4_symlink_inode_operations; +extern const struct inode_operations ext4_fast_symlink_inode_operations; + +/* sysfs.c */ +extern void ext4_notify_error_sysfs(struct ext4_sb_info *sbi); +extern int ext4_register_sysfs(struct super_block *sb); +extern void ext4_unregister_sysfs(struct super_block *sb); +extern int __init ext4_init_sysfs(void); +extern void ext4_exit_sysfs(void); + +/* block_validity */ +extern void ext4_release_system_zone(struct super_block *sb); +extern int ext4_setup_system_zone(struct super_block *sb); +extern int __init ext4_init_system_zone(void); +extern void ext4_exit_system_zone(void); +extern int ext4_inode_block_valid(struct inode *inode, + ext4_fsblk_t start_blk, + unsigned int count); +extern int ext4_check_blockref(const char *, unsigned int, + struct inode *, __le32 *, unsigned int); +extern int ext4_sb_block_valid(struct super_block *sb, struct inode *inode, + ext4_fsblk_t start_blk, unsigned int count); + + +/* extents.c */ +struct ext4_ext_path; +struct ext4_extent; + +/* + * Maximum number of logical blocks in a file; ext4_extent's ee_block is + * __le32. 
+ */ +#define EXT_MAX_BLOCKS 0xffffffff + +extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); +extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ext_truncate(handle_t *, struct inode *); +extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end); +extern void ext4_ext_init(struct super_block *); +extern void ext4_ext_release(struct super_block *); +extern long ext4_fallocate(struct file *file, int mode, loff_t offset, + loff_t len); +extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len); +extern int ext4_convert_unwritten_extents_atomic(handle_t *handle, + struct inode *inode, loff_t offset, ssize_t len); +extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, + ext4_io_end_t *io_end); +extern int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern struct ext4_ext_path *ext4_ext_insert_extent( + handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext, int gb_flags); +extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path *, + int flags); +extern void ext4_free_ext_path(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); +extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); +extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +extern int ext4_get_es_cache(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +extern int ext4_ext_precache(struct inode *inode); +extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, + struct inode *inode2, ext4_lblk_t lblk1, + ext4_lblk_t lblk2, ext4_lblk_t count, + int mark_unwritten,int *err); +extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); +extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, + int check_cred, int restart_cred, + int revoke_cred); +extern void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end); +extern int ext4_ext_replay_set_iblocks(struct inode *inode); +extern int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, + int len, int unwritten, ext4_fsblk_t pblk); +extern int ext4_ext_clear_bb(struct inode *inode); + + +/* move_extent.c */ +extern void ext4_double_down_write_data_sem(struct inode *first, + struct inode *second); +extern void ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode); +extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 start_orig, __u64 start_donor, + __u64 len, __u64 *moved_len); + +/* page-io.c */ +extern int __init ext4_init_pageio(void); +extern void ext4_exit_pageio(void); +extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); +extern int ext4_put_io_end(ext4_io_end_t *io_end); +extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); +extern void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc); +extern void ext4_end_io_rsv_work(struct work_struct *work); +extern void ext4_io_submit(struct 
ext4_io_submit *io); +int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page, + size_t len); +extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); +extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); + +/* mmp.c */ +extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); + +/* mmp.c */ +extern void ext4_stop_mmpd(struct ext4_sb_info *sbi); + +/* verity.c */ +extern const struct fsverity_operations ext4_verityops; + +/* orphan.c */ +extern int ext4_orphan_add(handle_t *, struct inode *); +extern int ext4_orphan_del(handle_t *, struct inode *); +extern void ext4_orphan_cleanup(struct super_block *sb, + struct ext4_super_block *es); +extern void ext4_release_orphan_info(struct super_block *sb); +extern int ext4_init_orphan_info(struct super_block *sb); +extern int ext4_orphan_file_empty(struct super_block *sb); +extern void ext4_orphan_file_block_trigger( + struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size); + +/* + * Add new method to test whether block and inode bitmaps are properly + * initialized. With uninit_bg reading the block from disk is not enough + * to mark the bitmap uptodate. We need to also zero-out the bitmap + */ +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart + +static inline int bitmap_uptodate(struct buffer_head *bh) +{ + return (buffer_uptodate(bh) && + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); +} +static inline void set_bitmap_uptodate(struct buffer_head *bh) +{ + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); +} + +extern int ext4_resize_begin(struct super_block *sb); +extern int ext4_resize_end(struct super_block *sb, bool update_backups); + +static inline void ext4_set_io_unwritten_flag(struct ext4_io_end *io_end) +{ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) + io_end->flag |= EXT4_IO_END_UNWRITTEN; +} + +static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) +{ + if (io_end->flag & EXT4_IO_END_UNWRITTEN) + io_end->flag &= ~EXT4_IO_END_UNWRITTEN; +} + +extern const struct iomap_ops ext4_iomap_ops; +extern const struct iomap_ops ext4_iomap_overwrite_ops; +extern const struct iomap_ops ext4_iomap_report_ops; + +static inline int ext4_buffer_uptodate(struct buffer_head *bh) +{ + /* + * If the buffer has the write error flag, we have failed + * to write out data in the block. In this case, we don't + * have to read the block because we may read the old data + * successfully. + */ + if (buffer_write_io_error(bh)) + set_buffer_uptodate(bh); + return buffer_uptodate(bh); +} + +static inline bool ext4_inode_can_atomic_write(struct inode *inode) +{ + + return S_ISREG(inode->i_mode) && + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && + EXT4_SB(inode->i_sb)->s_awu_min > 0; +} + +extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, + loff_t pos, unsigned len, + get_block_t *get_block); +#endif /* __KERNEL__ */ + +#define EFSBADCRC EBADMSG /* Bad CRC detected */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ + +#endif /* _EXT4_H */ -- 2.43.0
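The s_blocks_count_lo/_hi accessors in the header above all follow one pattern: a 64-bit quantity is stored as two little-endian 32-bit halves, and the high half is honoured only when the 64-bit incompat feature is enabled. A minimal standalone sketch of that round-trip (not part of the patch; uint32_t stands in for __le32 and a bool stands in for the EXT4_FEATURE_INCOMPAT_64BIT test, so endianness conversion is omitted):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the __le32 superblock fields (endianness handling omitted) */
struct sb_counts {
	uint32_t s_blocks_count_lo;
	uint32_t s_blocks_count_hi;
};

/* Mirrors ext4_blocks_count_set(): split the 64-bit value into halves */
static void blocks_count_set(struct sb_counts *es, uint64_t blk)
{
	es->s_blocks_count_lo = (uint32_t)blk;
	es->s_blocks_count_hi = (uint32_t)(blk >> 32);
}

/*
 * Mirrors ext4_read_incompat_64bit_val(): the high half only counts
 * when the 64-bit incompat feature is set
 */
static uint64_t blocks_count(const struct sb_counts *es, bool has_64bit)
{
	return (has_64bit ? (uint64_t)es->s_blocks_count_hi << 32 : 0) |
	       es->s_blocks_count_lo;
}

int main(void)
{
	struct sb_counts es;
	uint64_t n = 0x123456789abULL;	/* a count that needs > 32 bits */

	blocks_count_set(&es, n);
	printf("64bit on:  0x%llx\n", (unsigned long long)blocks_count(&es, true));
	printf("64bit off: 0x%llx\n", (unsigned long long)blocks_count(&es, false));
	return 0;
}

Note how a read on a filesystem without the feature bit ignores the high word entirely and yields only the low half, which is exactly what ext4_read_incompat_64bit_val() expresses.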
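Similarly, the ext4_try_lock_group()/ext4_lock_group() pair above implements a small adaptive heuristic: every uncontended trylock decays a saturating per-filesystem counter, every contended acquisition bumps it, and ext4_fs_is_busy() reports contention once the counter passes a threshold. A userspace sketch of the same idea (an illustration only, assuming POSIX spinlocks and C11 atomics; compile with -pthread; add_unless() here is a hand-rolled stand-in for the kernel's atomic_add_unless()):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_CONTENTION		8
#define CONTENTION_THRESHOLD	2

static pthread_spinlock_t group_lock;
static atomic_int lock_busy;		/* saturating contention counter */

/*
 * Add 'delta' unless the counter already sits at 'limit' (CAS loop;
 * a stand-in for the kernel's atomic_add_unless())
 */
static void add_unless(atomic_int *v, int delta, int limit)
{
	int old = atomic_load(v);

	while (old != limit &&
	       !atomic_compare_exchange_weak(v, &old, old + delta))
		;
}

static bool try_lock_group(void)
{
	if (pthread_spin_trylock(&group_lock))
		return false;
	add_unless(&lock_busy, -1, 0);	/* uncontended: decay towards 0 */
	return true;
}

static void lock_group(void)
{
	if (!try_lock_group()) {
		/* contended: bump the counter, then block on the lock */
		add_unless(&lock_busy, 1, MAX_CONTENTION);
		pthread_spin_lock(&group_lock);
	}
}

static bool fs_is_busy(void)
{
	return atomic_load(&lock_busy) > CONTENTION_THRESHOLD;
}

int main(void)
{
	pthread_spin_init(&group_lock, PTHREAD_PROCESS_PRIVATE);
	lock_group();			/* uncontended path */
	printf("busy? %d\n", fs_is_busy());
	pthread_spin_unlock(&group_lock);
	pthread_spin_destroy(&group_lock);
	return 0;
}

The saturation bounds matter: the counter can never wander so far in either direction that a change in workload takes long to show up in ext4_fs_is_busy().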
From: Simon Glass <simon.glass@canonical.com> Copy acl.c, acl.h, balloc.c, bitmap.c, block_validity.c, crypto.c, and dir.c from Linux v6.18 fs/ext4 directory. - acl: POSIX ACL support - balloc: block allocation helpers - bitmap: bitmap manipulation utilities - block_validity: block validity checking - crypto: filesystem encryption support - dir: directory entry handling Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> --- fs/ext4l/acl.c | 304 +++++++++++ fs/ext4l/acl.h | 74 +++ fs/ext4l/balloc.c | 1003 +++++++++++++++++++++++++++++++++++++ fs/ext4l/bitmap.c | 99 ++++ fs/ext4l/block_validity.c | 370 ++++++++++++++ fs/ext4l/crypto.c | 241 +++++++++ fs/ext4l/dir.c | 693 +++++++++++++++++++++++++ 7 files changed, 2784 insertions(+) create mode 100644 fs/ext4l/acl.c create mode 100644 fs/ext4l/acl.h create mode 100644 fs/ext4l/balloc.c create mode 100644 fs/ext4l/bitmap.c create mode 100644 fs/ext4l/block_validity.c create mode 100644 fs/ext4l/crypto.c create mode 100644 fs/ext4l/dir.c diff --git a/fs/ext4l/acl.c b/fs/ext4l/acl.c new file mode 100644 index 00000000000..3bffe862f95 --- /dev/null +++ b/fs/ext4l/acl.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/acl.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> + */ + +#include <linux/quotaops.h> +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" +#include "acl.h" + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl * +ext4_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(ext4_acl_header)) + return ERR_PTR(-EINVAL); + if (((ext4_acl_header *)value)->a_version != + cpu_to_le32(EXT4_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(ext4_acl_header); + count = ext4_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n = 0; n < count; n++) { + ext4_acl_entry *entry = + (ext4_acl_entry *)value; + if ((char *)value + sizeof(ext4_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + + switch (acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(ext4_acl_entry_short); + break; + + case ACL_USER: + value = (char *)value + sizeof(ext4_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + case ACL_GROUP: + value = (char *)value + sizeof(ext4_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. 
+ */ +static void * +ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + ext4_acl_header *ext_acl; + char *e; + size_t n; + + *size = ext4_acl_size(acl->a_count); + ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count * + sizeof(ext4_acl_entry), GFP_NOFS); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); + e = (char *)ext_acl + sizeof(ext4_acl_header); + for (n = 0; n < acl->a_count; n++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; + ext4_acl_entry *entry = (ext4_acl_entry *)e; + entry->e_tag = cpu_to_le16(acl_e->e_tag); + entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch (acl_e->e_tag) { + case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, acl_e->e_uid)); + e += sizeof(ext4_acl_entry); + break; + case ACL_GROUP: + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, acl_e->e_gid)); + e += sizeof(ext4_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(ext4_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +/* + * Inode operation get_posix_acl(). + * + * inode->i_rwsem: don't care + */ +struct posix_acl * +ext4_get_acl(struct inode *inode, int type, bool rcu) +{ + int name_index; + char *value = NULL; + struct posix_acl *acl; + int retval; + + if (rcu) + return ERR_PTR(-ECHILD); + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + BUG(); + } + retval = ext4_xattr_get(inode, name_index, "", NULL, 0); + if (retval > 0) { + value = kmalloc(retval, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + retval = ext4_xattr_get(inode, name_index, "", value, retval); + } + if (retval > 0) + acl = ext4_acl_from_disk(value, retval); + else if (retval == -ENODATA || retval == -ENOSYS) + acl = NULL; + else + acl = ERR_PTR(retval); + kfree(value); + + return acl; +} + +/* + * Set the access or default ACL of an inode. + * + * inode->i_rwsem: down unless called from ext4_new_inode + */ +static int +__ext4_set_acl(handle_t *handle, struct inode *inode, int type, + struct posix_acl *acl, int xattr_flags) +{ + int name_index; + void *value = NULL; + size_t size = 0; + int error; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + + case ACL_TYPE_DEFAULT: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + + default: + return -EINVAL; + } + if (acl) { + value = ext4_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + error = ext4_xattr_set_handle(handle, inode, name_index, "", + value, size, xattr_flags); + + kfree(value); + if (!error) + set_cached_acl(inode, type, acl); + + return error; +} + +int +ext4_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, + struct posix_acl *acl, int type) +{ + handle_t *handle; + int error, credits, retries = 0; + size_t acl_size = acl ? 
ext4_acl_size(acl->a_count) : 0; + struct inode *inode = d_inode(dentry); + umode_t mode = inode->i_mode; + int update_mode = 0; + + error = dquot_initialize(inode); + if (error) + return error; +retry: + error = ext4_xattr_set_credits(inode, acl_size, false /* is_create */, + &credits); + if (error) + return error; + + handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if ((type == ACL_TYPE_ACCESS) && acl) { + error = posix_acl_update_mode(idmap, inode, &mode, &acl); + if (error) + goto out_stop; + if (mode != inode->i_mode) + update_mode = 1; + } + + error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */); + if (!error && update_mode) { + inode->i_mode = mode; + inode_set_ctime_current(inode); + error = ext4_mark_inode_dirty(handle, inode); + } +out_stop: + ext4_journal_stop(handle); + if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + return error; +} + +/* + * Initialize the ACLs of a new inode. Called from ext4_new_inode. + * + * dir->i_rwsem: down + * inode->i_rwsem: up (access to inode is still exclusive) + */ +int +ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + struct posix_acl *default_acl, *acl; + int error; + + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; + + if (default_acl) { + error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, + default_acl, XATTR_CREATE); + posix_acl_release(default_acl); + } else { + inode->i_default_acl = NULL; + } + if (acl) { + if (!error) + error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, + acl, XATTR_CREATE); + posix_acl_release(acl); + } else { + inode->i_acl = NULL; + } + return error; +} diff --git a/fs/ext4l/acl.h b/fs/ext4l/acl.h new file mode 100644 index 00000000000..0c5a79c3b5d --- /dev/null +++ b/fs/ext4l/acl.h @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + File: fs/ext4/acl.h + + (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> +*/ + +#include <linux/posix_acl_xattr.h> + +#define EXT4_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} ext4_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} ext4_acl_entry_short; + +typedef struct { + __le32 a_version; +} ext4_acl_header; + +static inline size_t ext4_acl_size(int count) +{ + if (count <= 4) { + return sizeof(ext4_acl_header) + + count * sizeof(ext4_acl_entry_short); + } else { + return sizeof(ext4_acl_header) + + 4 * sizeof(ext4_acl_entry_short) + + (count - 4) * sizeof(ext4_acl_entry); + } +} + +static inline int ext4_acl_count(size_t size) +{ + ssize_t s; + size -= sizeof(ext4_acl_header); + s = size - 4 * sizeof(ext4_acl_entry_short); + if (s < 0) { + if (size % sizeof(ext4_acl_entry_short)) + return -1; + return size / sizeof(ext4_acl_entry_short); + } else { + if (s % sizeof(ext4_acl_entry)) + return -1; + return s / sizeof(ext4_acl_entry) + 4; + } +} + +#ifdef CONFIG_EXT4_FS_POSIX_ACL + +/* acl.c */ +struct posix_acl *ext4_get_acl(struct inode *inode, int type, bool rcu); +int ext4_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, + struct posix_acl *acl, int type); +extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); + +#else /* CONFIG_EXT4_FS_POSIX_ACL */ +#include <linux/sched.h> +#define ext4_get_acl NULL +#define ext4_set_acl NULL + +static inline int +ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + return 0; +} +#endif /* CONFIG_EXT4_FS_POSIX_ACL */ + 
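The ext4_acl_size()/ext4_acl_count() helpers just above encode the on-disk layout assumption that an ACL's first four entries (USER_OBJ, GROUP_OBJ, MASK, OTHER) use the 4-byte short form, while any further USER/GROUP entries take the full 8-byte form carrying an e_id. A standalone sketch with the struct sizes written out as plain constants, checking that the two helpers invert each other (constants and names here are illustrative, not the kernel types):

#include <assert.h>
#include <stdio.h>

#define HDR_SIZE	4	/* sizeof(ext4_acl_header)      */
#define SHORT_SIZE	4	/* sizeof(ext4_acl_entry_short) */
#define FULL_SIZE	8	/* sizeof(ext4_acl_entry)       */

/* Mirrors ext4_acl_size(): first four entries are short, the rest full */
static long acl_size(long count)
{
	if (count <= 4)
		return HDR_SIZE + count * SHORT_SIZE;
	return HDR_SIZE + 4 * SHORT_SIZE + (count - 4) * FULL_SIZE;
}

/* Mirrors ext4_acl_count(): invert acl_size(), -1 for malformed sizes */
static long acl_count(long size)
{
	long s;

	size -= HDR_SIZE;
	s = size - 4 * SHORT_SIZE;
	if (s < 0)
		return (size % SHORT_SIZE) ? -1 : size / SHORT_SIZE;
	return (s % FULL_SIZE) ? -1 : s / FULL_SIZE + 4;
}

int main(void)
{
	for (long n = 0; n <= 32; n++)
		assert(acl_count(acl_size(n)) == n);
	printf("minimal ACL: %ld bytes, +1 named entry: %ld bytes\n",
	       acl_size(4), acl_size(5));	/* 20 and 28 */
	return 0;
}

A minimal access ACL therefore costs 20 bytes in the xattr (header plus four short entries), and each named user or group adds 8 more.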
diff --git a/fs/ext4l/balloc.c b/fs/ext4l/balloc.c new file mode 100644 index 00000000000..c9329ed5c09 --- /dev/null +++ b/fs/ext4l/balloc.c @@ -0,0 +1,1003 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/balloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include <linux/time.h> +#include <linux/capability.h> +#include <linux/fs.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> +#include "ext4.h" +#include "ext4_jbd2.h" +#include "mballoc.h" + +#include <trace/events/ext4.h> +#include <kunit/static_stub.h> + +static unsigned ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group); +/* + * balloc.c contains the blocks allocation and deallocation routines + */ + +/* + * Calculate block group number for a given block number + */ +ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block) +{ + ext4_group_t group; + + if (test_opt2(sb, STD_GROUP_SIZE)) + group = (block - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >> + (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3); + else + ext4_get_group_no_and_offset(sb, block, &group, NULL); + return group; +} + +/* + * Calculate the block group number and offset into the block/cluster + * allocation bitmap, given a block number + */ +void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + ext4_grpblk_t offset; + + blocknr = blocknr - le32_to_cpu(es->s_first_data_block); + offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >> + EXT4_SB(sb)->s_cluster_bits; + if (offsetp) + *offsetp = offset; + if (blockgrpp) + *blockgrpp = blocknr; + +} + +/* + * Check whether the 'block' lives within the 'block_group'. Returns 1 if so + * and 0 otherwise. + */ +static inline int ext4_block_in_group(struct super_block *sb, + ext4_fsblk_t block, + ext4_group_t block_group) +{ + ext4_group_t actual_group; + + actual_group = ext4_get_group_number(sb, block); + return (actual_group == block_group) ? 1 : 0; +} + +/* + * Return the number of clusters used for file system metadata; this + * represents the overhead needed by the file system. + */ +static unsigned ext4_num_overhead_clusters(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + unsigned base_clusters, num_clusters; + int block_cluster = -1, inode_cluster; + int itbl_cluster_start = -1, itbl_cluster_end = -1; + ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group); + ext4_fsblk_t end = start + EXT4_BLOCKS_PER_GROUP(sb) - 1; + ext4_fsblk_t itbl_blk_start, itbl_blk_end; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* This is the number of clusters used by the superblock, + * block group descriptors, and reserved block group + * descriptor blocks */ + base_clusters = ext4_num_base_meta_clusters(sb, block_group); + num_clusters = base_clusters; + + /* + * Account and record inode table clusters if any cluster + * is in the block group, or inode table cluster range is + * [-1, -1] and won't overlap with block/inode bitmap cluster + * accounted below. 
+ */ + itbl_blk_start = ext4_inode_table(sb, gdp); + itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1; + if (itbl_blk_start <= end && itbl_blk_end >= start) { + itbl_blk_start = max(itbl_blk_start, start); + itbl_blk_end = min(itbl_blk_end, end); + + itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start); + itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start); + + num_clusters += itbl_cluster_end - itbl_cluster_start + 1; + /* check if border cluster is overlapped */ + if (itbl_cluster_start == base_clusters - 1) + num_clusters--; + } + + /* + * For the allocation bitmaps, we first need to check to see + * if the block is in the block group. If it is, then check + * to see if the cluster is already accounted for in the clusters + * used for the base metadata cluster and inode tables cluster. + * Normally all of these blocks are contiguous, so the special + * case handling shouldn't be necessary except for *very* + * unusual file system layouts. + */ + if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { + block_cluster = EXT4_B2C(sbi, + ext4_block_bitmap(sb, gdp) - start); + if (block_cluster >= base_clusters && + (block_cluster < itbl_cluster_start || + block_cluster > itbl_cluster_end)) + num_clusters++; + } + + if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { + inode_cluster = EXT4_B2C(sbi, + ext4_inode_bitmap(sb, gdp) - start); + /* + * Additional check if inode bitmap is in just accounted + * block_cluster + */ + if (inode_cluster != block_cluster && + inode_cluster >= base_clusters && + (inode_cluster < itbl_cluster_start || + inode_cluster > itbl_cluster_end)) + num_clusters++; + } + + return num_clusters; +} + +static unsigned int num_clusters_in_group(struct super_block *sb, + ext4_group_t block_group) +{ + unsigned int blocks; + + if (block_group == ext4_get_groups_count(sb) - 1) { + /* + * Even though mke2fs always initializes the first and + * last group, just in case some other tool was used, + * we need to make sure we calculate the right free + * blocks. 
+ */ + blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) - + ext4_group_first_block_no(sb, block_group); + } else + blocks = EXT4_BLOCKS_PER_GROUP(sb); + return EXT4_NUM_B2C(EXT4_SB(sb), blocks); +} + +/* Initializes an uninitialized block bitmap */ +static int ext4_init_block_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + unsigned int bit, bit_max; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t start, tmp; + + ASSERT(buffer_locked(bh)); + + if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT | + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + return -EFSBADCRC; + } + memset(bh->b_data, 0, sb->s_blocksize); + + bit_max = ext4_num_base_meta_clusters(sb, block_group); + if ((bit_max >> 3) >= bh->b_size) + return -EFSCORRUPTED; + + for (bit = 0; bit < bit_max; bit++) + ext4_set_bit(bit, bh->b_data); + + start = ext4_group_first_block_no(sb, block_group); + + /* Set bits for block and inode bitmaps, and inode table */ + tmp = ext4_block_bitmap(sb, gdp); + if (ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); + + tmp = ext4_inode_bitmap(sb, gdp); + if (ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); + + tmp = ext4_inode_table(sb, gdp); + for (; tmp < ext4_inode_table(sb, gdp) + + sbi->s_itb_per_group; tmp++) { + if (ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); + } + + /* + * Also if the number of blocks within the group is less than + * the blocksize * 8 ( which is the size of bitmap ), set rest + * of the block bitmap to 1 + */ + ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), + sb->s_blocksize * 8, bh->b_data); + return 0; +} + +/* Return the number of free blocks in a block group. It is used when + * the block bitmap is uninitialized, so we can't just count the bits + * in the bitmap. */ +unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + return num_clusters_in_group(sb, block_group) - + ext4_num_overhead_clusters(sb, block_group, gdp); +} + +/* + * The free blocks are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. The descriptors are loaded in memory + * when a file system is mounted (see ext4_fill_super). 
+ */ + +/** + * ext4_get_group_desc() -- load group descriptor from disk + * @sb: super block + * @block_group: given block group + * @bh: pointer to the buffer head to store the block + * group descriptor + */ +struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head **bh) +{ + unsigned int group_desc; + unsigned int offset; + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *bh_p; + + KUNIT_STATIC_STUB_REDIRECT(ext4_get_group_desc, + sb, block_group, bh); + + if (block_group >= ngroups) { + ext4_error(sb, "block_group >= groups_count - block_group = %u," + " groups_count = %u", block_group, ngroups); + + return NULL; + } + + group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); + offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); + bh_p = sbi_array_rcu_deref(sbi, s_group_desc, group_desc); + /* + * sbi_array_rcu_deref returns with rcu unlocked, this is ok since + * the pointer being dereferenced won't be dereferenced again. By + * looking at the usage in add_new_gdb() the value isn't modified, + * just the pointer, and so it remains valid. + */ + if (!bh_p) { + ext4_error(sb, "Group descriptor not loaded - " + "block_group = %u, group_desc = %u, desc = %u", + block_group, group_desc, offset); + return NULL; + } + + desc = (struct ext4_group_desc *)( + (__u8 *)bh_p->b_data + + offset * EXT4_DESC_SIZE(sb)); + if (bh) + *bh = bh_p; + return desc; +} + +static ext4_fsblk_t ext4_valid_block_bitmap_padding(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh) +{ + ext4_grpblk_t next_zero_bit; + unsigned long bitmap_size = sb->s_blocksize * 8; + unsigned int offset = num_clusters_in_group(sb, block_group); + + if (bitmap_size <= offset) + return 0; + + next_zero_bit = ext4_find_next_zero_bit(bh->b_data, bitmap_size, offset); + + return (next_zero_bit < bitmap_size ? next_zero_bit : 0); +} + +struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info **grp_info; + long indexv, indexh; + + if (unlikely(group >= EXT4_SB(sb)->s_groups_count)) + return NULL; + indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); + indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); + grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv); + return grp_info[indexh]; +} + +/* + * Return the block number which was discovered to be invalid, or 0 if + * the block bitmap is valid. + */ +static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + ext4_group_t block_group, + struct buffer_head *bh) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_grpblk_t offset; + ext4_grpblk_t next_zero_bit; + ext4_grpblk_t max_bit = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_fsblk_t blk; + ext4_fsblk_t group_first_block; + + if (ext4_has_feature_flex_bg(sb)) { + /* with FLEX_BG, the inode/block bitmaps and itable + * blocks may not be in the group at all + * so the bitmap validation will be skipped for those groups + * or it has to also read the block group where the bitmaps + * are located to verify they are set. 
+ */ + return 0; + } + group_first_block = ext4_group_first_block_no(sb, block_group); + + /* check whether block bitmap block number is set */ + blk = ext4_block_bitmap(sb, desc); + offset = blk - group_first_block; + if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || + !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) + /* bad block bitmap */ + return blk; + + /* check whether the inode bitmap block number is set */ + blk = ext4_inode_bitmap(sb, desc); + offset = blk - group_first_block; + if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || + !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) + /* bad block bitmap */ + return blk; + + /* check whether the inode table block number is set */ + blk = ext4_inode_table(sb, desc); + offset = blk - group_first_block; + if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || + EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) >= max_bit) + return blk; + next_zero_bit = ext4_find_next_zero_bit(bh->b_data, + EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1, + EXT4_B2C(sbi, offset)); + if (next_zero_bit < + EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1) + /* bad bitmap for inode tables */ + return blk; + return 0; +} + +static int ext4_validate_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + ext4_group_t block_group, + struct buffer_head *bh) +{ + ext4_fsblk_t blk; + struct ext4_group_info *grp; + + if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) + return 0; + + grp = ext4_get_group_info(sb, block_group); + + if (buffer_verified(bh)) + return 0; + if (!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + return -EFSCORRUPTED; + + ext4_lock_group(sb, block_group); + if (buffer_verified(bh)) + goto verified; + if (unlikely(!ext4_block_bitmap_csum_verify(sb, desc, bh) || + ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EFSBADCRC; + } + blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); + if (unlikely(blk != 0)) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "bg %u: block %llu: invalid block bitmap", + block_group, blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EFSCORRUPTED; + } + blk = ext4_valid_block_bitmap_padding(sb, block_group, bh); + if (unlikely(blk != 0)) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "bg %u: block %llu: padding at end of block bitmap is not set", + block_group, blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EFSCORRUPTED; + } + set_buffer_verified(bh); +verified: + ext4_unlock_group(sb, block_group); + return 0; +} + +/** + * ext4_read_block_bitmap_nowait() + * @sb: super block + * @block_group: given block group + * @ignore_locked: ignore locked buffers + * + * Read the bitmap for a given block_group,and validate the + * bits for block/inode/inode tables are set in the bitmaps + * + * Return buffer_head on success or an ERR_PTR in case of failure. 
+ */ +struct buffer_head * +ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, + bool ignore_locked) +{ + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *bh; + ext4_fsblk_t bitmap_blk; + int err; + + KUNIT_STATIC_STUB_REDIRECT(ext4_read_block_bitmap_nowait, + sb, block_group, ignore_locked); + + desc = ext4_get_group_desc(sb, block_group, NULL); + if (!desc) + return ERR_PTR(-EFSCORRUPTED); + bitmap_blk = ext4_block_bitmap(sb, desc); + if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { + ext4_error(sb, "Invalid block bitmap block %llu in " + "block_group %u", bitmap_blk, block_group); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return ERR_PTR(-EFSCORRUPTED); + } + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext4_warning(sb, "Cannot get buffer for block bitmap - " + "block_group = %u, block_bitmap = %llu", + block_group, bitmap_blk); + return ERR_PTR(-ENOMEM); + } + + if (ignore_locked && buffer_locked(bh)) { + /* buffer under IO already, return if called for prefetching */ + put_bh(bh); + return NULL; + } + + if (bitmap_uptodate(bh)) + goto verify; + + lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + goto verify; + } + ext4_lock_group(sb, block_group); + if (ext4_has_group_desc_csum(sb) && + (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { + if (block_group == 0) { + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + ext4_error(sb, "Block bitmap for bg 0 marked " + "uninitialized"); + err = -EFSCORRUPTED; + goto out; + } + err = ext4_init_block_bitmap(sb, bh, block_group, desc); + if (err) { + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + ext4_error(sb, "Failed to init block bitmap for group " + "%u: %d", block_group, err); + goto out; + } + set_bitmap_uptodate(bh); + set_buffer_uptodate(bh); + set_buffer_verified(bh); + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + return bh; + } + ext4_unlock_group(sb, block_group); + if (buffer_uptodate(bh)) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh); + unlock_buffer(bh); + goto verify; + } + /* + * submit the buffer_head for reading + */ + set_buffer_new(bh); + trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked); + ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO | + (ignore_locked ? 
REQ_RAHEAD : 0), + ext4_end_bitmap_read, + ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_EIO)); + return bh; +verify: + err = ext4_validate_block_bitmap(sb, desc, block_group, bh); + if (err) + goto out; + return bh; +out: + put_bh(bh); + return ERR_PTR(err); +} + +/* Returns 0 on success, -errno on error */ +int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, + struct buffer_head *bh) +{ + struct ext4_group_desc *desc; + + KUNIT_STATIC_STUB_REDIRECT(ext4_wait_block_bitmap, + sb, block_group, bh); + + if (!buffer_new(bh)) + return 0; + desc = ext4_get_group_desc(sb, block_group, NULL); + if (!desc) + return -EFSCORRUPTED; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + ext4_error_err(sb, EIO, "Cannot read block bitmap - " + "block_group = %u, block_bitmap = %llu", + block_group, (unsigned long long) bh->b_blocknr); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EIO; + } + clear_buffer_new(bh); + /* Panic or remount fs read-only if block bitmap is invalid */ + return ext4_validate_block_bitmap(sb, desc, block_group, bh); +} + +struct buffer_head * +ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) +{ + struct buffer_head *bh; + int err; + + bh = ext4_read_block_bitmap_nowait(sb, block_group, false); + if (IS_ERR(bh)) + return bh; + err = ext4_wait_block_bitmap(sb, block_group, bh); + if (err) { + put_bh(bh); + return ERR_PTR(err); + } + return bh; +} + +/** + * ext4_has_free_clusters() + * @sbi: in-core super block structure. + * @nclusters: number of needed blocks + * @flags: flags from ext4_mb_new_blocks() + * + * Check if filesystem has nclusters free & available for allocation. + * On success return 1, return 0 on failure. + */ +static int ext4_has_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags) +{ + s64 free_clusters, dirty_clusters, rsv, resv_clusters; + struct percpu_counter *fcc = &sbi->s_freeclusters_counter; + struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter; + + free_clusters = percpu_counter_read_positive(fcc); + dirty_clusters = percpu_counter_read_positive(dcc); + resv_clusters = atomic64_read(&sbi->s_resv_clusters); + + /* + * r_blocks_count should always be multiple of the cluster ratio so + * we are safe to do a plane bit shift only. + */ + rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) + + resv_clusters; + + if (free_clusters - (nclusters + rsv + dirty_clusters) < + EXT4_FREECLUSTERS_WATERMARK) { + free_clusters = percpu_counter_sum_positive(fcc); + dirty_clusters = percpu_counter_sum_positive(dcc); + } + /* Check whether we have space after accounting for current + * dirty clusters & root reserved clusters. + */ + if (free_clusters >= (rsv + nclusters + dirty_clusters)) + return 1; + + /* Hm, nope. Are (enough) root reserved clusters available? */ + if (uid_eq(sbi->s_resuid, current_fsuid()) || + (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || + (flags & EXT4_MB_USE_ROOT_BLOCKS) || + capable(CAP_SYS_RESOURCE)) { + + if (free_clusters >= (nclusters + dirty_clusters + + resv_clusters)) + return 1; + } + /* No free blocks. 
Let's see if we can dip into reserved pool */ + if (flags & EXT4_MB_USE_RESERVED) { + if (free_clusters >= (nclusters + dirty_clusters)) + return 1; + } + + return 0; +} + +int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags) +{ + if (ext4_has_free_clusters(sbi, nclusters, flags)) { + percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters); + return 0; + } else + return -ENOSPC; +} + +/** + * ext4_should_retry_alloc() - check if a block allocation should be retried + * @sb: superblock + * @retries: number of retry attempts made so far + * + * ext4_should_retry_alloc() is called when ENOSPC is returned while + * attempting to allocate blocks. If there's an indication that a pending + * journal transaction might free some space and allow another attempt to + * succeed, this function will wait for the current or committing transaction + * to complete and then return TRUE. + */ +int ext4_should_retry_alloc(struct super_block *sb, int *retries) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!sbi->s_journal) + return 0; + + if (++(*retries) > 3) { + percpu_counter_inc(&sbi->s_sra_exceeded_retry_limit); + return 0; + } + + /* + * if there's no indication that blocks are about to be freed it's + * possible we just missed a transaction commit that did so + */ + smp_mb(); + if (atomic_read(&sbi->s_mb_free_pending) == 0) { + if (test_opt(sb, DISCARD)) { + atomic_inc(&sbi->s_retry_alloc_pending); + flush_work(&sbi->s_discard_work); + atomic_dec(&sbi->s_retry_alloc_pending); + } + return ext4_has_free_clusters(sbi, 1, 0); + } + + /* + * it's possible we've just missed a transaction commit here, + * so ignore the returned status + */ + ext4_debug("%s: retrying operation after ENOSPC\n", sb->s_id); + (void) jbd2_journal_force_commit_nested(sbi->s_journal); + return 1; +} + +/* + * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks + * + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block(filesystem wide) + * @count: pointer to total number of clusters needed + * @errp: error code + * + * Return 1st allocated block number on success, *count stores total account + * error stores in errp pointer + */ +ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, unsigned int flags, + unsigned long *count, int *errp) +{ + struct ext4_allocation_request ar; + ext4_fsblk_t ret; + + memset(&ar, 0, sizeof(ar)); + /* Fill with neighbour allocated blocks */ + ar.inode = inode; + ar.goal = goal; + ar.len = count ? *count : 1; + ar.flags = flags; + + ret = ext4_mb_new_blocks(handle, &ar, errp); + if (count) + *count = ar.len; + /* + * Account for the allocated meta blocks. We will never + * fail EDQUOT for metdata, but we do account for it. + */ + if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { + dquot_alloc_block_nofail(inode, + EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); + } + return ret; +} + +/** + * ext4_count_free_clusters() -- count filesystem free clusters + * @sb: superblock + * + * Adds up the number of free clusters from each block group. 
+ */ +ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) +{ + ext4_fsblk_t desc_count; + struct ext4_group_desc *gdp; + ext4_group_t i; + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_group_info *grp; +#ifdef EXT4FS_DEBUG + struct ext4_super_block *es; + ext4_fsblk_t bitmap_count; + unsigned int x; + struct buffer_head *bitmap_bh = NULL; + + es = EXT4_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + grp = NULL; + if (EXT4_SB(sb)->s_group_info) + grp = ext4_get_group_info(sb, i); + if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + desc_count += ext4_free_group_clusters(sb, gdp); + brelse(bitmap_bh); + bitmap_bh = ext4_read_block_bitmap(sb, i); + if (IS_ERR(bitmap_bh)) { + bitmap_bh = NULL; + continue; + } + + x = ext4_count_free(bitmap_bh->b_data, + EXT4_CLUSTERS_PER_GROUP(sb) / 8); + printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", + i, ext4_free_group_clusters(sb, gdp), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" + ", computed = %llu, %llu\n", + EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), + desc_count, bitmap_count); + return bitmap_count; +#else + desc_count = 0; + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + grp = NULL; + if (EXT4_SB(sb)->s_group_info) + grp = ext4_get_group_info(sb, i); + if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + desc_count += ext4_free_group_clusters(sb, gdp); + } + + return desc_count; +#endif +} + +static inline int test_root(ext4_group_t a, int b) +{ + while (1) { + if (a < b) + return 0; + if (a == b) + return 1; + if ((a % b) != 0) + return 0; + a = a / b; + } +} + +/** + * ext4_bg_has_super - number of blocks used by the superblock in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the superblock (primary or backup) + * in this group. Currently this will be only 0 or 1. + */ +int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + if (group == 0) + return 1; + if (ext4_has_feature_sparse_super2(sb)) { + if (group == le32_to_cpu(es->s_backup_bgs[0]) || + group == le32_to_cpu(es->s_backup_bgs[1])) + return 1; + return 0; + } + if ((group <= 1) || !ext4_has_feature_sparse_super(sb)) + return 1; + if (!(group & 1)) + return 0; + if (test_root(group, 3) || (test_root(group, 5)) || + test_root(group, 7)) + return 1; + + return 0; +} + +static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, + ext4_group_t group) +{ + unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); + ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb); + ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1; + + if (group == first || group == first + 1 || group == last) + return 1; + return 0; +} + +static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, + ext4_group_t group) +{ + if (!ext4_bg_has_super(sb, group)) + return 0; + + if (ext4_has_feature_meta_bg(sb)) + return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); + else + return EXT4_SB(sb)->s_gdb_count; +} + +/** + * ext4_bg_num_gdb - number of blocks used by the group table in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the group descriptor table + * (primary or backup) in this group. 
In the future there may be a + * different number of descriptor blocks in each group. + */ +unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) +{ + unsigned long first_meta_bg = + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); + unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); + + if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) + return ext4_bg_num_gdb_nometa(sb, group); + + return ext4_bg_num_gdb_meta(sb,group); + +} + +/* + * This function returns the number of file system metadata blocks at + * the beginning of a block group, including the reserved gdt blocks. + */ +unsigned int ext4_num_base_meta_blocks(struct super_block *sb, + ext4_group_t block_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned num; + + /* Check for superblock and gdt backups in this group */ + num = ext4_bg_has_super(sb, block_group); + + if (!ext4_has_feature_meta_bg(sb) || + block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * + sbi->s_desc_per_block) { + if (num) { + num += ext4_bg_num_gdb_nometa(sb, block_group); + num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + } + } else { /* For META_BG_BLOCK_GROUPS */ + num += ext4_bg_num_gdb_meta(sb, block_group); + } + return num; +} + +static unsigned int ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group) +{ + return EXT4_NUM_B2C(EXT4_SB(sb), ext4_num_base_meta_blocks(sb, block_group)); +} + +/** + * ext4_inode_to_goal_block - return a hint for block allocation + * @inode: inode for block allocation + * + * Return the ideal location to start allocating blocks for a + * newly created inode. + */ +ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_group_t block_group; + ext4_grpblk_t colour; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); + ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; + + block_group = ei->i_block_group; + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { + /* + * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME + * block groups per flexgroup, reserve the first block + * group for directories and special files. Regular + * files will start at the second block group. This + * tends to speed up directory access and improves + * fsck times. + */ + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + + /* + * If we are doing delayed allocation, we don't need to take + * colour into account.
+ */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (task_pid_nr(current) % 16) * + (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (task_pid_nr(current) % 16) * + ((last_block - bg_start) / 16); + return bg_start + colour; +} + diff --git a/fs/ext4l/bitmap.c b/fs/ext4l/bitmap.c new file mode 100644 index 00000000000..87760fabdd2 --- /dev/null +++ b/fs/ext4l/bitmap.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/bitmap.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include <linux/buffer_head.h> +#include "ext4.h" + +unsigned int ext4_count_free(char *bitmap, unsigned int numchars) +{ + return numchars * BITS_PER_BYTE - memweight(bitmap, numchars); +} + +int ext4_inode_bitmap_csum_verify(struct super_block *sb, + struct ext4_group_desc *gdp, + struct buffer_head *bh) +{ + __u32 hi; + __u32 provided, calculated; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int sz; + + if (!ext4_has_feature_metadata_csum(sb)) + return 1; + + sz = EXT4_INODES_PER_GROUP(sb) >> 3; + provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); + calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) { + hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi); + provided |= (hi << 16); + } else + calculated &= 0xFFFF; + + return provided == calculated; +} + +void ext4_inode_bitmap_csum_set(struct super_block *sb, + struct ext4_group_desc *gdp, + struct buffer_head *bh) +{ + __u32 csum; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int sz; + + if (!ext4_has_feature_metadata_csum(sb)) + return; + + sz = EXT4_INODES_PER_GROUP(sb) >> 3; + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); + if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) + gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); +} + +int ext4_block_bitmap_csum_verify(struct super_block *sb, + struct ext4_group_desc *gdp, + struct buffer_head *bh) +{ + __u32 hi; + __u32 provided, calculated; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; + + if (!ext4_has_feature_metadata_csum(sb)) + return 1; + + provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); + calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) { + hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi); + provided |= (hi << 16); + } else + calculated &= 0xFFFF; + + return provided == calculated; +} + +void ext4_block_bitmap_csum_set(struct super_block *sb, + struct ext4_group_desc *gdp, + struct buffer_head *bh) +{ + int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; + __u32 csum; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!ext4_has_feature_metadata_csum(sb)) + return; + + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); + if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) + gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16); +} diff --git a/fs/ext4l/block_validity.c b/fs/ext4l/block_validity.c new file mode 100644 index 00000000000..e8c5525afc6 --- /dev/null +++ b/fs/ext4l/block_validity.c @@ -0,0 +1,370 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/block_validity.c + * + * Copyright (C) 2009 
+ * Theodore Ts'o (tytso@mit.edu) + * + * Track which blocks in the filesystem are metadata blocks that + * should never be used as data blocks by files or directories. + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> +#include <linux/swap.h> +#include <linux/pagemap.h> +#include <linux/blkdev.h> +#include <linux/slab.h> +#include "ext4.h" + +struct ext4_system_zone { + struct rb_node node; + ext4_fsblk_t start_blk; + unsigned int count; + u32 ino; +}; + +static struct kmem_cache *ext4_system_zone_cachep; + +int __init ext4_init_system_zone(void) +{ + ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0); + if (ext4_system_zone_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_system_zone(void) +{ + rcu_barrier(); + kmem_cache_destroy(ext4_system_zone_cachep); +} + +static inline int can_merge(struct ext4_system_zone *entry1, + struct ext4_system_zone *entry2) +{ + if ((entry1->start_blk + entry1->count) == entry2->start_blk && + entry1->ino == entry2->ino) + return 1; + return 0; +} + +static void release_system_zone(struct ext4_system_blocks *system_blks) +{ + struct ext4_system_zone *entry, *n; + + rbtree_postorder_for_each_entry_safe(entry, n, + &system_blks->root, node) + kmem_cache_free(ext4_system_zone_cachep, entry); +} + +/* + * Mark a range of blocks as belonging to the "system zone" --- that + * is, filesystem metadata blocks which should never be used by + * inodes. + */ +static int add_system_zone(struct ext4_system_blocks *system_blks, + ext4_fsblk_t start_blk, + unsigned int count, u32 ino) +{ + struct ext4_system_zone *new_entry, *entry; + struct rb_node **n = &system_blks->root.rb_node, *node; + struct rb_node *parent = NULL, *new_node; + + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_system_zone, node); + if (start_blk < entry->start_blk) + n = &(*n)->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = &(*n)->rb_right; + else /* Unexpected overlap of system zones. */ + return -EFSCORRUPTED; + } + + new_entry = kmem_cache_alloc(ext4_system_zone_cachep, + GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + new_entry->start_blk = start_blk; + new_entry->count = count; + new_entry->ino = ino; + new_node = &new_entry->node; + + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &system_blks->root); + + /* Can we merge to the left? */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + if (can_merge(entry, new_entry)) { + new_entry->start_blk = entry->start_blk; + new_entry->count += entry->count; + rb_erase(node, &system_blks->root); + kmem_cache_free(ext4_system_zone_cachep, entry); + } + } + + /* Can we merge to the right? */ + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + if (can_merge(new_entry, entry)) { + new_entry->count += entry->count; + rb_erase(node, &system_blks->root); + kmem_cache_free(ext4_system_zone_cachep, entry); + } + } + return 0; +} + +static void debug_print_tree(struct ext4_sb_info *sbi) +{ + struct rb_node *node; + struct ext4_system_zone *entry; + struct ext4_system_blocks *system_blks; + int first = 1; + + printk(KERN_INFO "System zones: "); + rcu_read_lock(); + system_blks = rcu_dereference(sbi->s_system_blks); + node = rb_first(&system_blks->root); + while (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + printk(KERN_CONT "%s%llu-%llu", first ? 
"" : ", ", + entry->start_blk, entry->start_blk + entry->count - 1); + first = 0; + node = rb_next(node); + } + rcu_read_unlock(); + printk(KERN_CONT "\n"); +} + +static int ext4_protect_reserved_inode(struct super_block *sb, + struct ext4_system_blocks *system_blks, + u32 ino) +{ + struct inode *inode; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_map_blocks map; + u32 i = 0, num; + int err = 0, n; + + if ((ino < EXT4_ROOT_INO) || + (ino > le32_to_cpu(sbi->s_es->s_inodes_count))) + return -EINVAL; + inode = ext4_iget(sb, ino, EXT4_IGET_SPECIAL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + num = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + while (i < num) { + cond_resched(); + map.m_lblk = i; + map.m_len = num - i; + n = ext4_map_blocks(NULL, inode, &map, 0); + if (n < 0) { + err = n; + break; + } + if (n == 0) { + i++; + } else { + err = add_system_zone(system_blks, map.m_pblk, n, ino); + if (err < 0) { + if (err == -EFSCORRUPTED) { + EXT4_ERROR_INODE_ERR(inode, -err, + "blocks %llu-%llu from inode overlap system zone", + map.m_pblk, + map.m_pblk + map.m_len - 1); + } + break; + } + i += n; + } + } + iput(inode); + return err; +} + +static void ext4_destroy_system_zone(struct rcu_head *rcu) +{ + struct ext4_system_blocks *system_blks; + + system_blks = container_of(rcu, struct ext4_system_blocks, rcu); + release_system_zone(system_blks); + kfree(system_blks); +} + +/* + * Build system zone rbtree which is used for block validity checking. + * + * The update of system_blks pointer in this function is protected by + * sb->s_umount semaphore. However we have to be careful as we can be + * racing with ext4_inode_block_valid() calls reading system_blks rbtree + * protected only by RCU. That's why we first build the rbtree and then + * swap it in place. + */ +int ext4_setup_system_zone(struct super_block *sb) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_system_blocks *system_blks; + struct ext4_group_desc *gdp; + ext4_group_t i; + int ret; + + system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL); + if (!system_blks) + return -ENOMEM; + + for (i=0; i < ngroups; i++) { + unsigned int meta_blks = ext4_num_base_meta_blocks(sb, i); + + cond_resched(); + if (meta_blks != 0) { + ret = add_system_zone(system_blks, + ext4_group_first_block_no(sb, i), + meta_blks, 0); + if (ret) + goto err; + } + gdp = ext4_get_group_desc(sb, i, NULL); + ret = add_system_zone(system_blks, + ext4_block_bitmap(sb, gdp), 1, 0); + if (ret) + goto err; + ret = add_system_zone(system_blks, + ext4_inode_bitmap(sb, gdp), 1, 0); + if (ret) + goto err; + ret = add_system_zone(system_blks, + ext4_inode_table(sb, gdp), + sbi->s_itb_per_group, 0); + if (ret) + goto err; + } + if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) { + ret = ext4_protect_reserved_inode(sb, system_blks, + le32_to_cpu(sbi->s_es->s_journal_inum)); + if (ret) + goto err; + } + + /* + * System blks rbtree complete, announce it once to prevent racing + * with ext4_inode_block_valid() accessing the rbtree at the same + * time. + */ + rcu_assign_pointer(sbi->s_system_blks, system_blks); + + if (test_opt(sb, DEBUG)) + debug_print_tree(sbi); + return 0; +err: + release_system_zone(system_blks); + kfree(system_blks); + return ret; +} + +/* + * Called when the filesystem is unmounted or when remounting it with + * noblock_validity specified. + * + * The update of system_blks pointer in this function is protected by + * sb->s_umount semaphore. 
However we have to be careful as we can be + * racing with ext4_inode_block_valid() calls reading system_blks rbtree + * protected only by RCU. So we first clear the system_blks pointer and + * then free the rbtree only after RCU grace period expires. + */ +void ext4_release_system_zone(struct super_block *sb) +{ + struct ext4_system_blocks *system_blks; + + system_blks = rcu_dereference_protected(EXT4_SB(sb)->s_system_blks, + lockdep_is_held(&sb->s_umount)); + rcu_assign_pointer(EXT4_SB(sb)->s_system_blks, NULL); + + if (system_blks) + call_rcu(&system_blks->rcu, ext4_destroy_system_zone); +} + +int ext4_sb_block_valid(struct super_block *sb, struct inode *inode, + ext4_fsblk_t start_blk, unsigned int count) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_system_blocks *system_blks; + struct ext4_system_zone *entry; + struct rb_node *n; + int ret = 1; + + if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count < start_blk) || + (start_blk + count > ext4_blocks_count(sbi->s_es))) + return 0; + + /* + * Lock the system zone to prevent it being released concurrently + * when doing a remount which inverts the current "[no]block_validity" + * mount option. + */ + rcu_read_lock(); + system_blks = rcu_dereference(sbi->s_system_blks); + if (system_blks == NULL) + goto out_rcu; + + n = system_blks->root.rb_node; + while (n) { + entry = rb_entry(n, struct ext4_system_zone, node); + if (start_blk + count - 1 < entry->start_blk) + n = n->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = n->rb_right; + else { + ret = 0; + if (inode) + ret = (entry->ino == inode->i_ino); + break; + } + } +out_rcu: + rcu_read_unlock(); + return ret; +} + +/* + * Returns 1 if the passed-in block region (start_blk, + * start_blk+count) is valid; 0 if some part of the block region + * overlaps with some other filesystem metadata blocks.
+ */ +int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk, + unsigned int count) +{ + return ext4_sb_block_valid(inode->i_sb, inode, start_blk, count); +} + +int ext4_check_blockref(const char *function, unsigned int line, + struct inode *inode, __le32 *p, unsigned int max) +{ + __le32 *bref = p; + unsigned int blk; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + + if (journal && inode == journal->j_inode) + return 0; + + while (bref < p+max) { + blk = le32_to_cpu(*bref++); + if (blk && + unlikely(!ext4_inode_block_valid(inode, blk, 1))) { + ext4_error_inode(inode, function, line, blk, + "invalid block"); + return -EFSCORRUPTED; + } + } + return 0; +} + diff --git a/fs/ext4l/crypto.c b/fs/ext4l/crypto.c new file mode 100644 index 00000000000..cf0a0970c09 --- /dev/null +++ b/fs/ext4l/crypto.c @@ -0,0 +1,241 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/quotaops.h> +#include <linux/uuid.h> + +#include "ext4.h" +#include "xattr.h" +#include "ext4_jbd2.h" + +static void ext4_fname_from_fscrypt_name(struct ext4_filename *dst, + const struct fscrypt_name *src) +{ + memset(dst, 0, sizeof(*dst)); + + dst->usr_fname = src->usr_fname; + dst->disk_name = src->disk_name; + dst->hinfo.hash = src->hash; + dst->hinfo.minor_hash = src->minor_hash; + dst->crypto_buf = src->crypto_buf; +} + +int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct ext4_filename *fname) +{ + struct fscrypt_name name; + int err; + + err = fscrypt_setup_filename(dir, iname, lookup, &name); + if (err) + return err; + + ext4_fname_from_fscrypt_name(fname, &name); + + err = ext4_fname_setup_ci_filename(dir, iname, fname); + if (err) + ext4_fname_free_filename(fname); + + return err; +} + +int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, + struct ext4_filename *fname) +{ + struct fscrypt_name name; + int err; + + err = fscrypt_prepare_lookup(dir, dentry, &name); + if (err) + return err; + + ext4_fname_from_fscrypt_name(fname, &name); + + err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); + if (err) + ext4_fname_free_filename(fname); + return err; +} + +void ext4_fname_free_filename(struct ext4_filename *fname) +{ + struct fscrypt_name name; + + name.crypto_buf = fname->crypto_buf; + fscrypt_free_filename(&name); + + fname->crypto_buf.name = NULL; + fname->usr_fname = NULL; + fname->disk_name.name = NULL; + + ext4_fname_free_ci_filename(fname); +} + +static bool uuid_is_zero(__u8 u[16]) +{ + int i; + + for (i = 0; i < 16; i++) + if (u[i]) + return false; + return true; +} + +int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg) +{ + struct super_block *sb = file_inode(filp)->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err, err2; + handle_t *handle; + + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + + if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) { + err = mnt_want_write_file(filp); + if (err) + return err; + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto pwsalt_err_exit; + } + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); + if (err) + goto pwsalt_err_journal; + lock_buffer(sbi->s_sbh); + generate_random_uuid(sbi->s_es->s_encrypt_pw_salt); + ext4_superblock_csum_set(sb); + unlock_buffer(sbi->s_sbh); + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); +pwsalt_err_journal: + err2 = ext4_journal_stop(handle); + if (err2 && !err) + err = err2; +pwsalt_err_exit: + 
mnt_drop_write_file(filp); + if (err) + return err; + } + + if (copy_to_user(arg, sbi->s_es->s_encrypt_pw_salt, 16)) + return -EFAULT; + return 0; +} + +static int ext4_get_context(struct inode *inode, void *ctx, size_t len) +{ + return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len); +} + +static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, + void *fs_data) +{ + handle_t *handle = fs_data; + int res, res2, credits, retries = 0; + + /* + * Encrypting the root directory is not allowed because e2fsck expects + * lost+found to exist and be unencrypted, and encrypting the root + * directory would imply encrypting the lost+found directory as well as + * the filename "lost+found" itself. + */ + if (inode->i_ino == EXT4_ROOT_INO) + return -EPERM; + + if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode))) + return -EINVAL; + + if (ext4_test_inode_flag(inode, EXT4_INODE_DAX)) + return -EOPNOTSUPP; + + res = ext4_convert_inline_data(inode); + if (res) + return res; + + /* + * If a journal handle was specified, then the encryption context is + * being set on a new inode via inheritance and is part of a larger + * transaction to create the inode. Otherwise the encryption context is + * being set on an existing inode in its own transaction. Only in the + * latter case should the "retry on ENOSPC" logic be used. + */ + + if (handle) { + res = ext4_xattr_set_handle(handle, inode, + EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); + if (!res) { + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + ext4_clear_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA); + /* + * Update inode->i_flags - S_ENCRYPTED will be enabled, + * S_DAX may be disabled + */ + ext4_set_inode_flags(inode, false); + } + return res; + } + + res = dquot_initialize(inode); + if (res) + return res; +retry: + res = ext4_xattr_set_credits(inode, len, false /* is_create */, + &credits); + if (res) + return res; + + handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); + if (!res) { + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + /* + * Update inode->i_flags - S_ENCRYPTED will be enabled, + * S_DAX may be disabled + */ + ext4_set_inode_flags(inode, false); + res = ext4_mark_inode_dirty(handle, inode); + if (res) + EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); + } + res2 = ext4_journal_stop(handle); + + if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + if (!res) + res = res2; + return res; +} + +static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb) +{ + return EXT4_SB(sb)->s_dummy_enc_policy.policy; +} + +static bool ext4_has_stable_inodes(struct super_block *sb) +{ + return ext4_has_feature_stable_inodes(sb); +} + +const struct fscrypt_operations ext4_cryptops = { + .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_crypt_info) - + (int)offsetof(struct ext4_inode_info, vfs_inode), + .needs_bounce_pages = 1, + .has_32bit_inodes = 1, + .supports_subblock_data_units = 1, + .legacy_key_prefix = "ext4:", + .get_context = ext4_get_context, + .set_context = ext4_set_context, + .get_dummy_policy = ext4_get_dummy_policy, + .empty_dir = ext4_empty_dir, + .has_stable_inodes = ext4_has_stable_inodes, +}; diff --git a/fs/ext4l/dir.c b/fs/ext4l/dir.c new file mode 100644 index 
00000000000..d4164c507a9 --- /dev/null +++ b/fs/ext4l/dir.c @@ -0,0 +1,693 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Hash Tree Directory indexing (c) 2001 Daniel Phillips + * + */ + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/slab.h> +#include <linux/iversion.h> +#include <linux/unicode.h> +#include "ext4.h" +#include "xattr.h" + +static int ext4_dx_readdir(struct file *, struct dir_context *); + +/** + * is_dx_dir() - check if a directory is using htree indexing + * @inode: directory inode + * + * Check if the given dir-inode refers to an htree-indexed directory + * (or a directory which could potentially get converted to use htree + * indexing). + * + * Return 1 if it is a dx dir, 0 if not + */ +static int is_dx_dir(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (ext4_has_feature_dir_index(inode->i_sb) && + ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || + ((inode->i_size >> sb->s_blocksize_bits) == 1) || + ext4_has_inline_data(inode))) + return 1; + + return 0; +} + +static bool is_fake_dir_entry(struct ext4_dir_entry_2 *de) +{ + /* Check if . or .. , or skip if namelen is 0 */ + if ((de->name_len > 0) && (de->name_len <= 2) && (de->name[0] == '.') && + (de->name[1] == '.' || de->name[1] == '\0')) + return true; + /* Check if this is a csum entry */ + if (de->file_type == EXT4_FT_DIR_CSUM) + return true; + return false; +} + +/* + * Return 0 if the directory entry is OK, and 1 if there is a problem + * + * Note: this is the opposite of what ext2 and ext3 historically returned... + * + * bh passed here can be an inode block or a dir data block, depending + * on the inode inline data flag. + */ +int __ext4_check_dir_entry(const char *function, unsigned int line, + struct inode *dir, struct file *filp, + struct ext4_dir_entry_2 *de, + struct buffer_head *bh, char *buf, int size, + unsigned int offset) +{ + const char *error_msg = NULL; + const int rlen = ext4_rec_len_from_disk(de->rec_len, + dir->i_sb->s_blocksize); + const int next_offset = ((char *) de - buf) + rlen; + bool fake = is_fake_dir_entry(de); + bool has_csum = ext4_has_feature_metadata_csum(dir->i_sb); + + if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir))) + error_msg = "rec_len is smaller than minimal"; + else if (unlikely(rlen % 4 != 0)) + error_msg = "rec_len % 4 != 0"; + else if (unlikely(rlen < ext4_dir_rec_len(de->name_len, + fake ? NULL : dir))) + error_msg = "rec_len is too small for name_len"; + else if (unlikely(next_offset > size)) + error_msg = "directory entry overrun"; + else if (unlikely(next_offset > size - ext4_dir_rec_len(1, + has_csum ? NULL : dir) && + next_offset != size)) + error_msg = "directory entry too close to block end"; + else if (unlikely(le32_to_cpu(de->inode) > + le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) + error_msg = "inode out of bounds"; + else if (unlikely(next_offset == size && de->name_len == 1 && + de->name[0] == '.')) + error_msg = "'.' 
directory cannot be the last in data block"; + else + return 0; + + if (filp) + ext4_error_file(filp, function, line, bh->b_blocknr, + "bad entry in directory: %s - offset=%u, " + "inode=%u, rec_len=%d, size=%d fake=%d", + error_msg, offset, le32_to_cpu(de->inode), + rlen, size, fake); + else + ext4_error_inode(dir, function, line, bh->b_blocknr, + "bad entry in directory: %s - offset=%u, " + "inode=%u, rec_len=%d, size=%d fake=%d", + error_msg, offset, le32_to_cpu(de->inode), + rlen, size, fake); + + return 1; +} + +static int ext4_readdir(struct file *file, struct dir_context *ctx) +{ + unsigned int offset; + int i; + struct ext4_dir_entry_2 *de; + int err; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + struct buffer_head *bh = NULL; + struct fscrypt_str fstr = FSTR_INIT(NULL, 0); + struct dir_private_info *info = file->private_data; + + err = fscrypt_prepare_readdir(inode); + if (err) + return err; + + if (is_dx_dir(inode)) { + err = ext4_dx_readdir(file, ctx); + if (err != ERR_BAD_DX_DIR) + return err; + + /* Can we just clear INDEX flag to ignore htree information? */ + if (!ext4_has_feature_metadata_csum(sb)) { + /* + * We don't set the inode dirty flag since it's not + * critical that it gets flushed back to the disk. + */ + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); + } + } + + if (ext4_has_inline_data(inode)) { + int has_inline_data = 1; + err = ext4_read_inline_dir(file, ctx, + &has_inline_data); + if (has_inline_data) + return err; + } + + if (IS_ENCRYPTED(inode)) { + err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr); + if (err < 0) + return err; + } + + while (ctx->pos < inode->i_size) { + struct ext4_map_blocks map; + + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto errout; + } + cond_resched(); + offset = ctx->pos & (sb->s_blocksize - 1); + map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); + map.m_len = 1; + err = ext4_map_blocks(NULL, inode, &map, 0); + if (err == 0) { + /* m_len should never be zero but let's avoid + * an infinite loop if it somehow is */ + if (map.m_len == 0) + map.m_len = 1; + ctx->pos += map.m_len * sb->s_blocksize; + continue; + } + if (err > 0) { + pgoff_t index = map.m_pblk >> + (PAGE_SHIFT - inode->i_blkbits); + if (!ra_has_index(&file->f_ra, index)) + page_cache_sync_readahead( + sb->s_bdev->bd_mapping, + &file->f_ra, file, + index, 1); + file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT; + bh = ext4_bread(NULL, inode, map.m_lblk, 0); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + bh = NULL; + goto errout; + } + } + + if (!bh) { + /* corrupt size? Maybe no more blocks to read */ + if (ctx->pos > inode->i_blocks << 9) + break; + ctx->pos += sb->s_blocksize - offset; + continue; + } + + /* Check the checksum */ + if (!buffer_verified(bh) && + !ext4_dirblock_csum_verify(inode, bh)) { + EXT4_ERROR_FILE(file, 0, "directory fails checksum " + "at offset %llu", + (unsigned long long)ctx->pos); + ctx->pos += sb->s_blocksize - offset; + brelse(bh); + bh = NULL; + continue; + } + set_buffer_verified(bh); + + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (!inode_eq_iversion(inode, info->cookie)) { + for (i = 0; i < sb->s_blocksize && i < offset; ) { + de = (struct ext4_dir_entry_2 *) + (bh->b_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. 
A + * failure will be detected in the + * dirent test below. */ + if (ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize) < ext4_dir_rec_len(1, + inode)) + break; + i += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); + } + offset = i; + ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) + | offset; + info->cookie = inode_query_iversion(inode); + } + + while (ctx->pos < inode->i_size + && offset < sb->s_blocksize) { + de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); + if (ext4_check_dir_entry(inode, file, de, bh, + bh->b_data, bh->b_size, + offset)) { + /* + * On error, skip to the next block + */ + ctx->pos = (ctx->pos | + (sb->s_blocksize - 1)) + 1; + break; + } + offset += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); + if (le32_to_cpu(de->inode)) { + if (!IS_ENCRYPTED(inode)) { + if (!dir_emit(ctx, de->name, + de->name_len, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) + goto done; + } else { + int save_len = fstr.len; + struct fscrypt_str de_name = + FSTR_INIT(de->name, + de->name_len); + u32 hash; + u32 minor_hash; + + if (IS_CASEFOLDED(inode)) { + hash = EXT4_DIRENT_HASH(de); + minor_hash = EXT4_DIRENT_MINOR_HASH(de); + } else { + hash = 0; + minor_hash = 0; + } + + /* Directory is encrypted */ + err = fscrypt_fname_disk_to_usr(inode, + hash, minor_hash, &de_name, &fstr); + de_name = fstr; + fstr.len = save_len; + if (err) + goto errout; + if (!dir_emit(ctx, + de_name.name, de_name.len, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) + goto done; + } + } + ctx->pos += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); + } + if ((ctx->pos < inode->i_size) && !dir_relax_shared(inode)) + goto done; + brelse(bh); + bh = NULL; + } +done: + err = 0; +errout: + fscrypt_fname_free_buffer(&fstr); + brelse(bh); + return err; +} + +static inline int is_32bit_api(void) +{ +#ifdef CONFIG_COMPAT + return in_compat_syscall(); +#else + return (BITS_PER_LONG == 32); +#endif +} + +/* + * These functions convert from the major/minor hash to an f_pos + * value for dx directories + * + * Upper layer (for example NFS) should specify FMODE_32BITHASH or + * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted + * directly on both 32-bit and 64-bit nodes, under such case, neither + * FMODE_32BITHASH nor FMODE_64BITHASH is specified. + */ +static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return major >> 1; + else + return ((__u64)(major >> 1) << 32) | (__u64)minor; +} + +static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return (pos << 1) & 0xffffffff; + else + return ((pos >> 32) << 1) & 0xffffffff; +} + +static inline __u32 pos2min_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return 0; + else + return pos & 0xffffffff; +} + +/* + * Return 32- or 64-bit end-of-file for dx directories + */ +static inline loff_t ext4_get_htree_eof(struct file *filp) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return EXT4_HTREE_EOF_32BIT; + else + return EXT4_HTREE_EOF_64BIT; +} + + +/* + * ext4_dir_llseek() calls generic_file_llseek_size to handle htree + * directories, where the "offset" is in terms of the filename hash + * value instead of the byte offset. 
+ * + * Because we may return a 64-bit hash that is well beyond offset limits, + * we need to pass the max hash as the maximum allowable offset in + * the htree directory case. + * + * For non-htree, ext4_llseek already chooses the proper max offset. + */ +static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + struct dir_private_info *info = file->private_data; + int dx_dir = is_dx_dir(inode); + loff_t ret, htree_max = ext4_get_htree_eof(file); + + if (likely(dx_dir)) + ret = generic_file_llseek_size(file, offset, whence, + htree_max, htree_max); + else + ret = ext4_llseek(file, offset, whence); + info->cookie = inode_peek_iversion(inode) - 1; + return ret; +} + +/* + * This structure holds the nodes of the red-black tree used to store + * the directory entry in hash order. + */ +struct fname { + __u32 hash; + __u32 minor_hash; + struct rb_node rb_hash; + struct fname *next; + __u32 inode; + __u8 name_len; + __u8 file_type; + char name[] __counted_by(name_len); +}; + +/* + * This function implements a non-recursive way of freeing all of the + * nodes in the red-black tree. + */ +static void free_rb_tree_fname(struct rb_root *root) +{ + struct fname *fname, *next; + + rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) + while (fname) { + struct fname *old = fname; + fname = fname->next; + kfree(old); + } + + *root = RB_ROOT; +} + +static void ext4_htree_init_dir_info(struct file *filp, loff_t pos) +{ + struct dir_private_info *p = filp->private_data; + + if (is_dx_dir(file_inode(filp)) && !p->initialized) { + p->curr_hash = pos2maj_hash(filp, pos); + p->curr_minor_hash = pos2min_hash(filp, pos); + p->initialized = true; + } +} + +void ext4_htree_free_dir_info(struct dir_private_info *p) +{ + free_rb_tree_fname(&p->root); + kfree(p); +} + +/* + * Given a directory entry, enter it into the fname rb tree. + * + * When filename encryption is enabled, the dirent will hold the + * encrypted filename, while the htree will hold decrypted filename. + * The decrypted filename is passed in via ent_name. parameter. + */ +int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent, + struct fscrypt_str *ent_name) +{ + struct rb_node **p, *parent = NULL; + struct fname *fname, *new_fn; + struct dir_private_info *info; + + info = dir_file->private_data; + p = &info->root.rb_node; + + /* Create and allocate the fname structure */ + new_fn = kzalloc(struct_size(new_fn, name, ent_name->len + 1), + GFP_KERNEL); + if (!new_fn) + return -ENOMEM; + new_fn->hash = hash; + new_fn->minor_hash = minor_hash; + new_fn->inode = le32_to_cpu(dirent->inode); + new_fn->name_len = ent_name->len; + new_fn->file_type = dirent->file_type; + memcpy(new_fn->name, ent_name->name, ent_name->len); + + while (*p) { + parent = *p; + fname = rb_entry(parent, struct fname, rb_hash); + + /* + * If the hash and minor hash match up, then we put + * them on a linked list. This rarely happens... 
+ */ + if ((new_fn->hash == fname->hash) && + (new_fn->minor_hash == fname->minor_hash)) { + new_fn->next = fname->next; + fname->next = new_fn; + return 0; + } + + if (new_fn->hash < fname->hash) + p = &(*p)->rb_left; + else if (new_fn->hash > fname->hash) + p = &(*p)->rb_right; + else if (new_fn->minor_hash < fname->minor_hash) + p = &(*p)->rb_left; + else /* if (new_fn->minor_hash > fname->minor_hash) */ + p = &(*p)->rb_right; + } + + rb_link_node(&new_fn->rb_hash, parent, p); + rb_insert_color(&new_fn->rb_hash, &info->root); + return 0; +} + + + +/* + * This is a helper function for ext4_dx_readdir. It calls filldir + * for all entries on the fname linked list. (Normally there is only + * one entry on the linked list, unless there are 62 bit hash collisions.) + */ +static int call_filldir(struct file *file, struct dir_context *ctx, + struct fname *fname) +{ + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + + if (!fname) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " + "called with null fname?!?", __func__, __LINE__, + inode->i_ino, current->comm); + return 0; + } + ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); + while (fname) { + if (!dir_emit(ctx, fname->name, + fname->name_len, + fname->inode, + get_dtype(sb, fname->file_type))) { + info->extra_fname = fname; + return 1; + } + fname = fname->next; + } + return 0; +} + +static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) +{ + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); + struct fname *fname; + int ret = 0; + + ext4_htree_init_dir_info(file, ctx->pos); + + if (ctx->pos == ext4_get_htree_eof(file)) + return 0; /* EOF */ + + /* Some one has messed with f_pos; reset the world */ + if (info->last_pos != ctx->pos) { + free_rb_tree_fname(&info->root); + info->curr_node = NULL; + info->extra_fname = NULL; + info->curr_hash = pos2maj_hash(file, ctx->pos); + info->curr_minor_hash = pos2min_hash(file, ctx->pos); + } + + /* + * If there are any leftover names on the hash collision + * chain, return them first. + */ + if (info->extra_fname) { + if (call_filldir(file, ctx, info->extra_fname)) + goto finished; + info->extra_fname = NULL; + goto next_node; + } else if (!info->curr_node) + info->curr_node = rb_first(&info->root); + + while (1) { + /* + * Fill the rbtree if we have no more entries, + * or the inode has changed since we last read in the + * cached entries. 
+ */ + if ((!info->curr_node) || + !inode_eq_iversion(inode, info->cookie)) { + info->curr_node = NULL; + free_rb_tree_fname(&info->root); + info->cookie = inode_query_iversion(inode); + ret = ext4_htree_fill_tree(file, info->curr_hash, + info->curr_minor_hash, + &info->next_hash); + if (ret < 0) + goto finished; + if (ret == 0) { + ctx->pos = ext4_get_htree_eof(file); + break; + } + info->curr_node = rb_first(&info->root); + } + + fname = rb_entry(info->curr_node, struct fname, rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + if (call_filldir(file, ctx, fname)) + break; + next_node: + info->curr_node = rb_next(info->curr_node); + if (info->curr_node) { + fname = rb_entry(info->curr_node, struct fname, + rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + } else { + if (info->next_hash == ~0) { + ctx->pos = ext4_get_htree_eof(file); + break; + } + info->curr_hash = info->next_hash; + info->curr_minor_hash = 0; + } + } +finished: + info->last_pos = ctx->pos; + return ret < 0 ? ret : 0; +} + +static int ext4_release_dir(struct inode *inode, struct file *filp) +{ + if (filp->private_data) + ext4_htree_free_dir_info(filp->private_data); + + return 0; +} + +int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, + int buf_size) +{ + struct ext4_dir_entry_2 *de; + int rlen; + unsigned int offset = 0; + char *top; + + de = buf; + top = buf + buf_size; + while ((char *) de < top) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + buf, buf_size, offset)) + return -EFSCORRUPTED; + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -EFSCORRUPTED; + + return 0; +} + +static int ext4_dir_open(struct inode *inode, struct file *file) +{ + struct dir_private_info *info; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + file->private_data = info; + return 0; +} + +const struct file_operations ext4_dir_operations = { + .open = ext4_dir_open, + .llseek = ext4_dir_llseek, + .read = generic_read_dir, + .iterate_shared = ext4_readdir, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .fsync = ext4_sync_file, + .release = ext4_release_dir, +}; -- 2.43.0
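For readers following the balloc.c code in the patch above: the sparse_super placement rule in ext4_bg_has_super() is easy to exercise outside the kernel. The following standalone userspace sketch (an editorial illustration, not part of the patch) mirrors the test_root() walk from balloc.c and lists which block groups carry a superblock backup, namely groups 0 and 1 plus the powers of 3, 5 and 7:

#include <stdio.h>

/* Same divisibility walk as test_root() in balloc.c above. */
static int test_root(unsigned int a, unsigned int b)
{
	while (1) {
		if (a < b)
			return 0;
		if (a == b)
			return 1;
		if ((a % b) != 0)
			return 0;
		a = a / b;
	}
}

/* The sparse_super case of ext4_bg_has_super(), with the feature
 * checks and sparse_super2 handling dropped for brevity. */
static int bg_has_super_sparse(unsigned int group)
{
	if (group <= 1)
		return 1;
	if (!(group & 1))	/* even groups never hold a backup */
		return 0;
	return test_root(group, 3) || test_root(group, 5) ||
	       test_root(group, 7);
}

int main(void)
{
	unsigned int g;

	/* prints: 0 1 3 5 7 9 25 27 49 81 */
	for (g = 0; g < 100; g++)
		if (bg_has_super_sparse(g))
			printf("%u ", g);
	printf("\n");
	return 0;
}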
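Similarly, the lo/hi split used by the bitmap checksum helpers in bitmap.c can be shown in isolation. This sketch (again an editorial illustration; the struct and field names are stand-ins, and the le16 conversions of the real code are omitted) demonstrates why the verify side masks the computed checksum with 0xFFFF when the group descriptor is too small to store the high half:

#include <stdint.h>
#include <stdio.h>

struct fake_gd {	/* stand-in for the two csum fields of ext4_group_desc */
	uint16_t csum_lo;
	uint16_t csum_hi;
};

static void csum_set(struct fake_gd *gd, uint32_t csum, int big_desc)
{
	gd->csum_lo = csum & 0xFFFF;
	if (big_desc)	/* s_desc_size >= ..._CSUM_HI_END in the real code */
		gd->csum_hi = csum >> 16;
}

static int csum_verify(const struct fake_gd *gd, uint32_t calculated,
		       int big_desc)
{
	uint32_t provided = gd->csum_lo;

	if (big_desc)
		provided |= (uint32_t)gd->csum_hi << 16;
	else		/* only 16 bits were stored, so compare 16 bits */
		calculated &= 0xFFFF;
	return provided == calculated;
}

int main(void)
{
	struct fake_gd gd = { 0, 0 };

	csum_set(&gd, 0x12345678, 1);
	printf("big desc: %d\n", csum_verify(&gd, 0x12345678, 1));	/* 1 */
	csum_set(&gd, 0x12345678, 0);
	printf("small desc: %d\n", csum_verify(&gd, 0x12345678, 0));	/* 1 */
	return 0;
}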
From: Simon Glass <simon.glass@canonical.com> Copy extents_status.c, extents_status.h, fsync.c, and hash.c from the Linux v6.18 fs/ext4 directory. - extents_status: extent status tree for tracking extent state - fsync: file synchronization operations - hash: directory entry hashing for htree directories Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> --- fs/ext4l/extents_status.c | 2306 +++++++++++++++++++++++++++++++++++++ fs/ext4l/extents_status.h | 254 ++++ fs/ext4l/fsync.c | 177 +++ fs/ext4l/hash.c | 323 ++++++ 4 files changed, 3060 insertions(+) create mode 100644 fs/ext4l/extents_status.c create mode 100644 fs/ext4l/extents_status.h create mode 100644 fs/ext4l/fsync.c create mode 100644 fs/ext4l/hash.c diff --git a/fs/ext4l/extents_status.c b/fs/ext4l/extents_status.c new file mode 100644 index 00000000000..31dc0496f8d --- /dev/null +++ b/fs/ext4l/extents_status.c @@ -0,0 +1,2306 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/ext4/extents_status.c + * + * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> + * Modified by + * Allison Henderson <achender@linux.vnet.ibm.com> + * Hugh Dickins <hughd@google.com> + * Zheng Liu <wenqing.lz@taobao.com> + * + * Ext4 extents status tree core functions. + */ +#include <linux/list_sort.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include "ext4.h" + +#include <trace/events/ext4.h> + +/* + * According to previous discussion in Ext4 Developer Workshop, we + * will introduce a new structure called io tree to track all extent + * status in order to solve some problems that we have met + * (e.g. Reservation space warning), and provide extent-level locking. + * Delay extent tree is the first step to achieve this goal. It was + * originally built by Yongqiang Yang. At that time it was called the delay + * extent tree, whose goal was only to track delayed extents in memory to + * simplify the implementation of fiemap and bigalloc, and introduce + * lseek SEEK_DATA/SEEK_HOLE support. That is why it was still called the + * delay extent tree at the first commit. But to better reflect + * what it does, it has been renamed to the extent status tree. + * + * Step1: + * Currently the first step has been done. All delayed extents are + * tracked in the tree. It maintains the delayed extent when a delayed + * allocation is issued, and the delayed extent is written out or + * invalidated. Therefore the implementations of fiemap and bigalloc + * are simplified, and SEEK_DATA/SEEK_HOLE are introduced. + * + * The following comment describes the implementation of the extent + * status tree and future work. + * + * Step2: + * In this step all extent status are tracked by the extent status tree. + * Thus, we can first try to look up a block mapping in this tree before + * finding it in the extent tree. Hence, the single extent cache can be + * removed because the extent status tree can do a better job. Extents in + * the status tree are loaded on-demand. Therefore, the extent status tree + * may not contain all of the extents in a file. Meanwhile we define a + * shrinker to reclaim memory from the extent status tree because a + * fragmented extent tree will make the status tree cost too much memory. + * Written/unwritten/hole extents in the tree will be reclaimed by this + * shrinker when we are under high memory pressure. Delayed extents will + * not be reclaimed because fiemap, bigalloc, and seek_data/hole need them. + */ + +/* + * Extent status tree implementation for ext4.
+ * + * + * ========================================================================== + * The extent status tree tracks all extent status. + * + * 1. Why do we need to implement an extent status tree? + * + * Without an extent status tree, ext4 identifies a delayed extent by looking + * up the page cache; this has several deficiencies - complicated, buggy, + * and inefficient code. + * + * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a + * block or a range of blocks belongs to a delayed extent. + * + * Let us have a look at how they work without the extent status tree. + * -- FIEMAP + * FIEMAP looks up the page cache to identify delayed allocations from holes. + * + * -- SEEK_HOLE/DATA + * SEEK_HOLE/DATA has the same problem as FIEMAP. + * + * -- bigalloc + * bigalloc looks up the page cache to figure out if a block is + * already under delayed allocation or not to determine whether + * a quota reservation is needed for the cluster. + * + * -- writeout + * Writeout looks up the whole page cache to see if a buffer is + * mapped; even if there are not very many delayed buffers, this is + * time consuming. + * + * With the extent status tree implementation, FIEMAP, SEEK_HOLE/DATA, + * bigalloc and writeout can figure out if a block or a range of + * blocks is under delayed allocation (belongs to a delayed extent) or + * not by searching the extent status tree. + * + * + * ========================================================================== + * 2. Ext4 extent status tree implementation + * + * -- extent + * An extent is a range of blocks which are contiguous logically and + * physically. Unlike an extent in the extent tree, this extent in ext4 is + * an in-memory struct; there is no corresponding on-disk data. There + * is no limit on the length of an extent, so an extent can contain as many + * blocks as remain contiguous logically and physically. + * + * -- extent status tree + * Every inode has an extent status tree and all allocated blocks + * are added to the tree with their status. The extents in the + * tree are ordered by logical block number. + * + * -- operations on an extent status tree + * There are three important operations on a delayed extent tree: finding + * the next extent, adding an extent (a range of blocks) and removing an + * extent. + * + * -- race on an extent status tree + * The extent status tree is protected by inode->i_es_lock. + * + * -- memory consumption + * A fragmented extent tree will make the extent status tree cost too much + * memory. Hence, we will reclaim written/unwritten/hole extents from + * the tree under heavy memory pressure. + * + * ========================================================================== + * 3. Assurance of Ext4 extent status tree consistency + * + * When mapping blocks, Ext4 queries the extent status tree first and should + * always trust that the extent status tree is consistent and up to date. + * Therefore, it is important to adhere to the following rules when creating, + * modifying and removing extents. + * + * 1. Besides fastcommit replay, when Ext4 creates or queries block mappings, + * the extent information should always be processed through the extent + * status tree instead of being organized manually through the on-disk + * extent tree. + * + * 2. When updating the extent tree, Ext4 should acquire the i_data_sem + * exclusively and update the extent status tree atomically.
If the extents + * to be modified are large enough to exceed the range that a single + * i_data_sem can process (as ext4_datasem_ensure_credits() may drop + * i_data_sem to restart a transaction), it must (e.g. as ext4_punch_hole() + * does): + * + * a) Hold the i_rwsem and invalidate_lock exclusively. This ensures + * exclusion against page faults, as well as reads and writes that may + * concurrently modify the extent status tree. + * b) Evict all page cache in the affected range and recommend rebuilding + * or dropping the extent status tree after modifying the on-disk + * extent tree. This ensures exclusion against concurrent writebacks + * that do not hold those locks but only hold a folio lock. + * + * 3. Based on the rules above, when querying block mappings, Ext4 should at + * least hold the i_rwsem or invalidate_lock or folio lock(s) for the + * specified querying range. + * + * ========================================================================== + * 4. Performance analysis + * + * -- overhead + * 1. There is a cached extent for write access, so if writes are + * not very random, adding-space operations are O(1). + * + * -- gain + * 2. Code is much simpler, more readable, more maintainable and + * more efficient. + * + * + * ========================================================================== + * 5. TODO list + * + * -- Refactor delayed space reservation + * + * -- Extent-level locking + */ + +static struct kmem_cache *ext4_es_cachep; +static struct kmem_cache *ext4_pending_cachep; + +static int __es_insert_extent(struct inode *inode, struct extent_status *newes, + struct extent_status *prealloc); +static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t end, int *reserved, + struct extent_status *prealloc); +static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); +static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei); +static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, + struct pending_reservation **prealloc); + +int __init ext4_init_es(void) +{ + ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT); + if (ext4_es_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_es(void) +{ + kmem_cache_destroy(ext4_es_cachep); +} + +void ext4_es_init_tree(struct ext4_es_tree *tree) +{ + tree->root = RB_ROOT; + tree->cache_es = NULL; +} + +#ifdef ES_DEBUG__ +static void ext4_es_print_tree(struct inode *inode) +{ + struct ext4_es_tree *tree; + struct rb_node *node; + + printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino); + tree = &EXT4_I(inode)->i_es_tree; + node = rb_first(&tree->root); + while (node) { + struct extent_status *es; + es = rb_entry(node, struct extent_status, rb_node); + printk(KERN_DEBUG " [%u/%u) %llu %x", + es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + node = rb_next(node); + } + printk(KERN_DEBUG "\n"); +} +#else +#define ext4_es_print_tree(inode) +#endif + +static inline ext4_lblk_t ext4_es_end(struct extent_status *es) +{ + BUG_ON(es->es_lblk + es->es_len < es->es_lblk); + return es->es_lblk + es->es_len - 1; +} + +/* + * search through the tree for a delayed extent with a given offset. If + * it can't be found, try to find the next extent.
+ */ +static struct extent_status *__es_tree_search(struct rb_root *root, + ext4_lblk_t lblk) +{ + struct rb_node *node = root->rb_node; + struct extent_status *es = NULL; + + while (node) { + es = rb_entry(node, struct extent_status, rb_node); + if (lblk < es->es_lblk) + node = node->rb_left; + else if (lblk > ext4_es_end(es)) + node = node->rb_right; + else + return es; + } + + if (es && lblk < es->es_lblk) + return es; + + if (es && lblk > ext4_es_end(es)) { + node = rb_next(&es->rb_node); + return node ? rb_entry(node, struct extent_status, rb_node) : + NULL; + } + + return NULL; +} + +/* + * ext4_es_find_extent_range - find extent with specified status within block + * range or next extent following block range in + * extents status tree + * + * @inode - file containing the range + * @matching_fn - pointer to function that matches extents with desired status + * @lblk - logical block defining start of range + * @end - logical block defining end of range + * @es - extent found, if any + * + * Find the first extent within the block range specified by @lblk and @end + * in the extents status tree that satisfies @matching_fn. If a match + * is found, it's returned in @es. If not, and a matching extent is found + * beyond the block range, it's returned in @es. If no match is found, an + * extent is returned in @es whose es_lblk, es_len, and es_pblk components + * are 0. + */ +static void __es_find_extent_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es) +{ + struct ext4_es_tree *tree = NULL; + struct extent_status *es1 = NULL; + struct rb_node *node; + + WARN_ON(es == NULL); + WARN_ON(end < lblk); + + tree = &EXT4_I(inode)->i_es_tree; + + /* see if the extent has been cached */ + es->es_lblk = es->es_len = es->es_pblk = 0; + es1 = READ_ONCE(tree->cache_es); + if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) { + es_debug("%u cached by [%u/%u) %llu %x\n", + lblk, es1->es_lblk, es1->es_len, + ext4_es_pblock(es1), ext4_es_status(es1)); + goto out; + } + + es1 = __es_tree_search(&tree->root, lblk); + +out: + if (es1 && !matching_fn(es1)) { + while ((node = rb_next(&es1->rb_node)) != NULL) { + es1 = rb_entry(node, struct extent_status, rb_node); + if (es1->es_lblk > end) { + es1 = NULL; + break; + } + if (matching_fn(es1)) + break; + } + } + + if (es1 && matching_fn(es1)) { + WRITE_ONCE(tree->cache_es, es1); + es->es_lblk = es1->es_lblk; + es->es_len = es1->es_len; + es->es_pblk = es1->es_pblk; + } + +} + +/* + * Locking for __es_find_extent_range() for external use + */ +void ext4_es_find_extent_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es) +{ + es->es_lblk = es->es_len = es->es_pblk = 0; + + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return; + + trace_ext4_es_find_extent_range_enter(inode, lblk); + + read_lock(&EXT4_I(inode)->i_es_lock); + __es_find_extent_range(inode, matching_fn, lblk, end, es); + read_unlock(&EXT4_I(inode)->i_es_lock); + + trace_ext4_es_find_extent_range_exit(inode, es); +} + +/* + * __es_scan_range - search block range for block with specified status + * in extents status tree + * + * @inode - file containing the range + * @matching_fn - pointer to function that matches extents with desired status + * @lblk - logical block defining start of range + * @end - logical block defining end of range + * + * Returns true if at least one block in the specified block range 
satisfies + * the criterion specified by @matching_fn, and false if not. If at least + * one extent has the specified status, then there is at least one block + * in the cluster with that status. Should only be called by code that has + * taken i_es_lock. + */ +static bool __es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t start, ext4_lblk_t end) +{ + struct extent_status es; + + __es_find_extent_range(inode, matching_fn, start, end, &es); + if (es.es_len == 0) + return false; /* no matching extent in the tree */ + else if (es.es_lblk <= start && + start < es.es_lblk + es.es_len) + return true; + else if (start <= es.es_lblk && es.es_lblk <= end) + return true; + else + return false; +} +/* + * Locking for __es_scan_range() for external use + */ +bool ext4_es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end) +{ + bool ret; + + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return false; + + read_lock(&EXT4_I(inode)->i_es_lock); + ret = __es_scan_range(inode, matching_fn, lblk, end); + read_unlock(&EXT4_I(inode)->i_es_lock); + + return ret; +} + +/* + * __es_scan_clu - search cluster for block with specified status in + * extents status tree + * + * @inode - file containing the cluster + * @matching_fn - pointer to function that matches extents with desired status + * @lblk - logical block in cluster to be searched + * + * Returns true if at least one extent in the cluster containing @lblk + * satisfies the criterion specified by @matching_fn, and false if not. If at + * least one extent has the specified status, then there is at least one block + * in the cluster with that status. Should only be called by code that has + * taken i_es_lock. 
+ */ +static bool __es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t lblk_start, lblk_end; + + lblk_start = EXT4_LBLK_CMASK(sbi, lblk); + lblk_end = lblk_start + sbi->s_cluster_ratio - 1; + + return __es_scan_range(inode, matching_fn, lblk_start, lblk_end); +} + +/* + * Locking for __es_scan_clu() for external use + */ +bool ext4_es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk) +{ + bool ret; + + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return false; + + read_lock(&EXT4_I(inode)->i_es_lock); + ret = __es_scan_clu(inode, matching_fn, lblk); + read_unlock(&EXT4_I(inode)->i_es_lock); + + return ret; +} + +static void ext4_es_list_add(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (!list_empty(&ei->i_es_list)) + return; + + spin_lock(&sbi->s_es_lock); + if (list_empty(&ei->i_es_list)) { + list_add_tail(&ei->i_es_list, &sbi->s_es_list); + sbi->s_es_nr_inode++; + } + spin_unlock(&sbi->s_es_lock); +} + +static void ext4_es_list_del(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + spin_lock(&sbi->s_es_lock); + if (!list_empty(&ei->i_es_list)) { + list_del_init(&ei->i_es_list); + sbi->s_es_nr_inode--; + WARN_ON_ONCE(sbi->s_es_nr_inode < 0); + } + spin_unlock(&sbi->s_es_lock); +} + +static inline struct pending_reservation *__alloc_pending(bool nofail) +{ + if (!nofail) + return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC); + + return kmem_cache_zalloc(ext4_pending_cachep, GFP_KERNEL | __GFP_NOFAIL); +} + +static inline void __free_pending(struct pending_reservation *pr) +{ + kmem_cache_free(ext4_pending_cachep, pr); +} + +/* + * Returns true if we cannot fail to allocate memory for this extent_status + * entry and cannot reclaim it until its status changes. + */ +static inline bool ext4_es_must_keep(struct extent_status *es) +{ + /* fiemap, bigalloc, and seek_data/hole need to use it. */ + if (ext4_es_is_delayed(es)) + return true; + + return false; +} + +static inline struct extent_status *__es_alloc_extent(bool nofail) +{ + if (!nofail) + return kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); + + return kmem_cache_zalloc(ext4_es_cachep, GFP_KERNEL | __GFP_NOFAIL); +} + +static void ext4_es_init_extent(struct inode *inode, struct extent_status *es, + ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk) +{ + es->es_lblk = lblk; + es->es_len = len; + es->es_pblk = pblk; + + /* We never try to reclaim a must kept extent, so we don't count it. */ + if (!ext4_es_must_keep(es)) { + if (!EXT4_I(inode)->i_es_shk_nr++) + ext4_es_list_add(inode); + percpu_counter_inc(&EXT4_SB(inode->i_sb)-> + s_es_stats.es_stats_shk_cnt); + } + + EXT4_I(inode)->i_es_all_nr++; + percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); +} + +static inline void __es_free_extent(struct extent_status *es) +{ + kmem_cache_free(ext4_es_cachep, es); +} + +static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) +{ + EXT4_I(inode)->i_es_all_nr--; + percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); + + /* Decrease the shrink counter when we can reclaim the extent. 
*/ + if (!ext4_es_must_keep(es)) { + BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0); + if (!--EXT4_I(inode)->i_es_shk_nr) + ext4_es_list_del(inode); + percpu_counter_dec(&EXT4_SB(inode->i_sb)-> + s_es_stats.es_stats_shk_cnt); + } + + __es_free_extent(es); +} + +/* + * Check whether or not two extents can be merged + * Condition: + * - logical block number is contiguous + * - physical block number is contiguous + * - status is equal + */ +static int ext4_es_can_be_merged(struct extent_status *es1, + struct extent_status *es2) +{ + if (ext4_es_type(es1) != ext4_es_type(es2)) + return 0; + + if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) { + pr_warn("ES assertion failed when merging extents. " + "The sum of lengths of es1 (%d) and es2 (%d) " + "is bigger than allowed file size (%d)\n", + es1->es_len, es2->es_len, EXT_MAX_BLOCKS); + WARN_ON(1); + return 0; + } + + if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk) + return 0; + + if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && + (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2))) + return 1; + + if (ext4_es_is_hole(es1)) + return 1; + + /* we need to check delayed extent */ + if (ext4_es_is_delayed(es1)) + return 1; + + return 0; +} + +static struct extent_status * +ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct extent_status *es1; + struct rb_node *node; + + node = rb_prev(&es->rb_node); + if (!node) + return es; + + es1 = rb_entry(node, struct extent_status, rb_node); + if (ext4_es_can_be_merged(es1, es)) { + es1->es_len += es->es_len; + if (ext4_es_is_referenced(es)) + ext4_es_set_referenced(es1); + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + es = es1; + } + + return es; +} + +static struct extent_status * +ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct extent_status *es1; + struct rb_node *node; + + node = rb_next(&es->rb_node); + if (!node) + return es; + + es1 = rb_entry(node, struct extent_status, rb_node); + if (ext4_es_can_be_merged(es, es1)) { + es->es_len += es1->es_len; + if (ext4_es_is_referenced(es1)) + ext4_es_set_referenced(es); + rb_erase(node, &tree->root); + ext4_es_free_extent(inode, es1); + } + + return es; +} + +#ifdef ES_AGGRESSIVE_TEST +#include "ext4_extents.h" /* Needed when ES_AGGRESSIVE_TEST is defined */ + +static void ext4_es_insert_extent_ext_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ex; + ext4_lblk_t ee_block; + ext4_fsblk_t ee_start; + unsigned short ee_len; + int depth, ee_status, es_status; + + path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); + if (IS_ERR(path)) + return; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + + if (ex) { + + ee_block = le32_to_cpu(ex->ee_block); + ee_start = ext4_ext_pblock(ex); + ee_len = ext4_ext_get_actual_len(ex); + + ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0; + es_status = ext4_es_is_unwritten(es) ? 1 : 0; + + /* + * Make sure ex and es are not overlap when we try to insert + * a delayed/hole extent. 
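+ * (That is, the extent found on disk must not overlap the
+ * delayed/hole extent being inserted, since a delayed or hole
+ * entry implies no on-disk mapping exists for those blocks.)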
+ */ + if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { + if (in_range(es->es_lblk, ee_block, ee_len)) { + pr_warn("ES insert assertion failed for " + "inode: %lu we can find an extent " + "at block [%d/%d/%llu/%c], but we " + "want to add a delayed/hole extent " + "[%d/%d/%llu/%x]\n", + inode->i_ino, ee_block, ee_len, + ee_start, ee_status ? 'u' : 'w', + es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + goto out; + } + + /* + * We don't check ee_block == es->es_lblk, etc. because es + * might be a part of whole extent, vice versa. + */ + if (es->es_lblk < ee_block || + ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { + pr_warn("ES insert assertion failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 'u' : 'w'); + goto out; + } + + if (ee_status ^ es_status) { + pr_warn("ES insert assertion failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 'u' : 'w'); + } + } else { + /* + * We can't find an extent on disk. So we need to make sure + * that we don't want to add an written/unwritten extent. + */ + if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { + pr_warn("ES insert assertion failed for inode: %lu " + "can't find an extent at block %d but we want " + "to add a written/unwritten extent " + "[%d/%d/%llu/%x]\n", inode->i_ino, + es->es_lblk, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + } +out: + ext4_free_ext_path(path); +} + +static void ext4_es_insert_extent_ind_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_map_blocks map; + int retval; + + /* + * Here we call ext4_ind_map_blocks to lookup a block mapping because + * 'Indirect' structure is defined in indirect.c. So we couldn't + * access direct/indirect tree from outside. It is too dirty to define + * this function in indirect.c file. + */ + + map.m_lblk = es->es_lblk; + map.m_len = es->es_len; + + retval = ext4_ind_map_blocks(NULL, inode, &map, 0); + if (retval > 0) { + if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) { + /* + * We want to add a delayed/hole extent but this + * block has been allocated. + */ + pr_warn("ES insert assertion failed for inode: %lu " + "We can find blocks but we want to add a " + "delayed/hole extent [%d/%d/%llu/%x]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } else if (ext4_es_is_written(es)) { + if (retval != es->es_len) { + pr_warn("ES insert assertion failed for " + "inode: %lu retval %d != es_len %d\n", + inode->i_ino, retval, es->es_len); + return; + } + if (map.m_pblk != ext4_es_pblock(es)) { + pr_warn("ES insert assertion failed for " + "inode: %lu m_pblk %llu != " + "es_pblk %llu\n", + inode->i_ino, map.m_pblk, + ext4_es_pblock(es)); + return; + } + } else { + /* + * We don't need to check unwritten extent because + * indirect-based file doesn't have it. 
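+ * Unwritten extents require the extents feature, which
+ * indirect-mapped files by definition lack, so reaching this
+ * branch indicates an inconsistency.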
+ */ + BUG(); + } + } else if (retval == 0) { + if (ext4_es_is_written(es)) { + pr_warn("ES insert assertion failed for inode: %lu " + "We can't find the block but we want to add " + "a written extent [%d/%d/%llu/%x]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } + } +} + +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) +{ + /* + * We don't need to worry about the race condition because + * caller takes i_data_sem locking. + */ + BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ext4_es_insert_extent_ext_check(inode, es); + else + ext4_es_insert_extent_ind_check(inode, es); +} +#else +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) +{ +} +#endif + +static int __es_insert_extent(struct inode *inode, struct extent_status *newes, + struct extent_status *prealloc) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct rb_node **p = &tree->root.rb_node; + struct rb_node *parent = NULL; + struct extent_status *es; + + while (*p) { + parent = *p; + es = rb_entry(parent, struct extent_status, rb_node); + + if (newes->es_lblk < es->es_lblk) { + if (ext4_es_can_be_merged(newes, es)) { + /* + * Here we can modify es_lblk directly + * because it isn't overlapped. + */ + es->es_lblk = newes->es_lblk; + es->es_len += newes->es_len; + if (ext4_es_is_written(es) || + ext4_es_is_unwritten(es)) + ext4_es_store_pblock(es, + newes->es_pblk); + es = ext4_es_try_to_merge_left(inode, es); + goto out; + } + p = &(*p)->rb_left; + } else if (newes->es_lblk > ext4_es_end(es)) { + if (ext4_es_can_be_merged(es, newes)) { + es->es_len += newes->es_len; + es = ext4_es_try_to_merge_right(inode, es); + goto out; + } + p = &(*p)->rb_right; + } else { + BUG(); + return -EINVAL; + } + } + + if (prealloc) + es = prealloc; + else + es = __es_alloc_extent(false); + if (!es) + return -ENOMEM; + ext4_es_init_extent(inode, es, newes->es_lblk, newes->es_len, + newes->es_pblk); + + rb_link_node(&es->rb_node, parent, p); + rb_insert_color(&es->rb_node, &tree->root); + +out: + tree->cache_es = es; + return 0; +} + +/* + * ext4_es_insert_extent() adds information to an inode's extent + * status tree. 
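+ *
+ * This cannot fail: on ENOMEM it retries with __GFP_NOFAIL
+ * preallocations (see the retry label below), except that an entry
+ * which is not "must keep" (see ext4_es_must_keep()) may simply be
+ * dropped from the cache instead of being inserted.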
+ */ +void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status, bool delalloc_reserve_used) +{ + struct extent_status newes; + ext4_lblk_t end = lblk + len - 1; + int err1 = 0, err2 = 0, err3 = 0; + int resv_used = 0, pending = 0; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct extent_status *es1 = NULL; + struct extent_status *es2 = NULL; + struct pending_reservation *pr = NULL; + bool revise_pending = false; + + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return; + + es_debug("add [%u/%u) %llu %x %d to extent status tree of inode %lu\n", + lblk, len, pblk, status, delalloc_reserve_used, inode->i_ino); + + if (!len) + return; + + BUG_ON(end < lblk); + WARN_ON_ONCE(status & EXTENT_STATUS_DELAYED); + + newes.es_lblk = lblk; + newes.es_len = len; + ext4_es_store_pblock_status(&newes, pblk, status); + trace_ext4_es_insert_extent(inode, &newes); + + ext4_es_insert_extent_check(inode, &newes); + + revise_pending = sbi->s_cluster_ratio > 1 && + test_opt(inode->i_sb, DELALLOC) && + (status & (EXTENT_STATUS_WRITTEN | + EXTENT_STATUS_UNWRITTEN)); +retry: + if (err1 && !es1) + es1 = __es_alloc_extent(true); + if ((err1 || err2) && !es2) + es2 = __es_alloc_extent(true); + if ((err1 || err2 || err3 < 0) && revise_pending && !pr) + pr = __alloc_pending(true); + write_lock(&EXT4_I(inode)->i_es_lock); + + err1 = __es_remove_extent(inode, lblk, end, &resv_used, es1); + if (err1 != 0) + goto error; + /* Free preallocated extent if it didn't get used. */ + if (es1) { + if (!es1->es_len) + __es_free_extent(es1); + es1 = NULL; + } + + err2 = __es_insert_extent(inode, &newes, es2); + if (err2 == -ENOMEM && !ext4_es_must_keep(&newes)) + err2 = 0; + if (err2 != 0) + goto error; + /* Free preallocated extent if it didn't get used. */ + if (es2) { + if (!es2->es_len) + __es_free_extent(es2); + es2 = NULL; + } + + if (revise_pending) { + err3 = __revise_pending(inode, lblk, len, &pr); + if (err3 < 0) + goto error; + if (pr) { + __free_pending(pr); + pr = NULL; + } + pending = err3; + } +error: + write_unlock(&EXT4_I(inode)->i_es_lock); + /* + * Reduce the reserved cluster count to reflect successful deferred + * allocation of delayed allocated clusters or direct allocation of + * clusters discovered to be delayed allocated. Once allocated, a + * cluster is not included in the reserved count. + * + * When direct allocating (from fallocate, filemap, DIO, or clusters + * allocated when delalloc has been disabled by ext4_nonda_switch()) + * an extent either 1) contains delayed blocks but start with + * non-delayed allocated blocks (e.g. hole) or 2) contains non-delayed + * allocated blocks which belong to delayed allocated clusters when + * bigalloc feature is enabled, quota has already been claimed by + * ext4_mb_new_blocks(), so release the quota reservations made for + * any previously delayed allocated clusters instead of claim them + * again. + */ + resv_used += pending; + if (resv_used) + ext4_da_update_reserve_space(inode, resv_used, + delalloc_reserve_used); + + if (err1 || err2 || err3 < 0) + goto retry; + + ext4_es_print_tree(inode); + return; +} + +/* + * ext4_es_cache_extent() inserts information into the extent status + * tree if and only if there isn't information about the range in + * question already. 
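+ *
+ * The return value of __es_insert_extent() is deliberately ignored
+ * here: failing to cache an entry is harmless, since the caller can
+ * always fall back to reading the on-disk extent tree.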
+ */ +void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status) +{ + struct extent_status *es; + struct extent_status newes; + ext4_lblk_t end = lblk + len - 1; + + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return; + + newes.es_lblk = lblk; + newes.es_len = len; + ext4_es_store_pblock_status(&newes, pblk, status); + trace_ext4_es_cache_extent(inode, &newes); + + if (!len) + return; + + BUG_ON(end < lblk); + + write_lock(&EXT4_I(inode)->i_es_lock); + + es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk); + if (!es || es->es_lblk > end) + __es_insert_extent(inode, &newes, NULL); + write_unlock(&EXT4_I(inode)->i_es_lock); +} + +/* + * ext4_es_lookup_extent() looks up an extent in extent status tree. + * + * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks. + * + * Return: 1 on found, 0 on not + */ +int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t *next_lblk, + struct extent_status *es) +{ + struct ext4_es_tree *tree; + struct ext4_es_stats *stats; + struct extent_status *es1 = NULL; + struct rb_node *node; + int found = 0; + + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return 0; + + trace_ext4_es_lookup_extent_enter(inode, lblk); + es_debug("lookup extent in block %u\n", lblk); + + tree = &EXT4_I(inode)->i_es_tree; + read_lock(&EXT4_I(inode)->i_es_lock); + + /* find extent in cache firstly */ + es->es_lblk = es->es_len = es->es_pblk = 0; + es1 = READ_ONCE(tree->cache_es); + if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) { + es_debug("%u cached by [%u/%u)\n", + lblk, es1->es_lblk, es1->es_len); + found = 1; + goto out; + } + + node = tree->root.rb_node; + while (node) { + es1 = rb_entry(node, struct extent_status, rb_node); + if (lblk < es1->es_lblk) + node = node->rb_left; + else if (lblk > ext4_es_end(es1)) + node = node->rb_right; + else { + found = 1; + break; + } + } + +out: + stats = &EXT4_SB(inode->i_sb)->s_es_stats; + if (found) { + BUG_ON(!es1); + es->es_lblk = es1->es_lblk; + es->es_len = es1->es_len; + es->es_pblk = es1->es_pblk; + if (!ext4_es_is_referenced(es1)) + ext4_es_set_referenced(es1); + percpu_counter_inc(&stats->es_stats_cache_hits); + if (next_lblk) { + node = rb_next(&es1->rb_node); + if (node) { + es1 = rb_entry(node, struct extent_status, + rb_node); + *next_lblk = es1->es_lblk; + } else + *next_lblk = 0; + } + } else { + percpu_counter_inc(&stats->es_stats_cache_misses); + } + + read_unlock(&EXT4_I(inode)->i_es_lock); + + trace_ext4_es_lookup_extent_exit(inode, es, found); + return found; +} + +struct rsvd_count { + int ndelayed; + bool first_do_lblk_found; + ext4_lblk_t first_do_lblk; + ext4_lblk_t last_do_lblk; + struct extent_status *left_es; + bool partial; + ext4_lblk_t lclu; +}; + +/* + * init_rsvd - initialize reserved count data before removing block range + * in file from extent status tree + * + * @inode - file containing range + * @lblk - first block in range + * @es - pointer to first extent in range + * @rc - pointer to reserved count data + * + * Assumes es is not NULL + */ +static void init_rsvd(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es, struct rsvd_count *rc) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct rb_node *node; + + rc->ndelayed = 0; + + /* + * for bigalloc, note the first delayed block in the range has not + * been found, record the extent containing the block to the left of + * the region to be removed, if any, and note that there's no 
partial + * cluster to track + */ + if (sbi->s_cluster_ratio > 1) { + rc->first_do_lblk_found = false; + if (lblk > es->es_lblk) { + rc->left_es = es; + } else { + node = rb_prev(&es->rb_node); + rc->left_es = node ? rb_entry(node, + struct extent_status, + rb_node) : NULL; + } + rc->partial = false; + } +} + +/* + * count_rsvd - count the clusters containing delayed blocks in a range + * within an extent and add to the running tally in rsvd_count + * + * @inode - file containing extent + * @lblk - first block in range + * @len - length of range in blocks + * @es - pointer to extent containing clusters to be counted + * @rc - pointer to reserved count data + * + * Tracks partial clusters found at the beginning and end of extents so + * they aren't overcounted when they span adjacent extents + */ +static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, + struct extent_status *es, struct rsvd_count *rc) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t i, end, nclu; + + if (!ext4_es_is_delayed(es)) + return; + + WARN_ON(len <= 0); + + if (sbi->s_cluster_ratio == 1) { + rc->ndelayed += (int) len; + return; + } + + /* bigalloc */ + + i = (lblk < es->es_lblk) ? es->es_lblk : lblk; + end = lblk + (ext4_lblk_t) len - 1; + end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end; + + /* record the first block of the first delayed extent seen */ + if (!rc->first_do_lblk_found) { + rc->first_do_lblk = i; + rc->first_do_lblk_found = true; + } + + /* update the last lblk in the region seen so far */ + rc->last_do_lblk = end; + + /* + * if we're tracking a partial cluster and the current extent + * doesn't start with it, count it and stop tracking + */ + if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) { + rc->ndelayed++; + rc->partial = false; + } + + /* + * if the first cluster doesn't start on a cluster boundary but + * ends on one, count it + */ + if (EXT4_LBLK_COFF(sbi, i) != 0) { + if (end >= EXT4_LBLK_CFILL(sbi, i)) { + rc->ndelayed++; + rc->partial = false; + i = EXT4_LBLK_CFILL(sbi, i) + 1; + } + } + + /* + * if the current cluster starts on a cluster boundary, count the + * number of whole delayed clusters in the extent + */ + if ((i + sbi->s_cluster_ratio - 1) <= end) { + nclu = (end - i + 1) >> sbi->s_cluster_bits; + rc->ndelayed += nclu; + i += nclu << sbi->s_cluster_bits; + } + + /* + * start tracking a partial cluster if there's a partial at the end + * of the current extent and we're not already tracking one + */ + if (!rc->partial && i <= end) { + rc->partial = true; + rc->lclu = EXT4_B2C(sbi, i); + } +} + +/* + * __pr_tree_search - search for a pending cluster reservation + * + * @root - root of pending reservation tree + * @lclu - logical cluster to search for + * + * Returns the pending reservation for the cluster identified by @lclu + * if found. If not, returns a reservation for the next cluster if any, + * and if not, returns NULL. + */ +static struct pending_reservation *__pr_tree_search(struct rb_root *root, + ext4_lblk_t lclu) +{ + struct rb_node *node = root->rb_node; + struct pending_reservation *pr = NULL; + + while (node) { + pr = rb_entry(node, struct pending_reservation, rb_node); + if (lclu < pr->lclu) + node = node->rb_left; + else if (lclu > pr->lclu) + node = node->rb_right; + else + return pr; + } + if (pr && lclu < pr->lclu) + return pr; + if (pr && lclu > pr->lclu) { + node = rb_next(&pr->rb_node); + return node ? 
rb_entry(node, struct pending_reservation, + rb_node) : NULL; + } + return NULL; +} + +/* + * get_rsvd - calculates and returns the number of cluster reservations to be + * released when removing a block range from the extent status tree + * and releases any pending reservations within the range + * + * @inode - file containing block range + * @end - last block in range + * @right_es - pointer to extent containing next block beyond end or NULL + * @rc - pointer to reserved count data + * + * The number of reservations to be released is equal to the number of + * clusters containing delayed blocks within the range, minus the number of + * clusters still containing delayed blocks at the ends of the range, and + * minus the number of pending reservations within the range. + */ +static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, + struct extent_status *right_es, + struct rsvd_count *rc) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct pending_reservation *pr; + struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; + struct rb_node *node; + ext4_lblk_t first_lclu, last_lclu; + bool left_delayed, right_delayed, count_pending; + struct extent_status *es; + + if (sbi->s_cluster_ratio > 1) { + /* count any remaining partial cluster */ + if (rc->partial) + rc->ndelayed++; + + if (rc->ndelayed == 0) + return 0; + + first_lclu = EXT4_B2C(sbi, rc->first_do_lblk); + last_lclu = EXT4_B2C(sbi, rc->last_do_lblk); + + /* + * decrease the delayed count by the number of clusters at the + * ends of the range that still contain delayed blocks - + * these clusters still need to be reserved + */ + left_delayed = right_delayed = false; + + es = rc->left_es; + while (es && ext4_es_end(es) >= + EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) { + if (ext4_es_is_delayed(es)) { + rc->ndelayed--; + left_delayed = true; + break; + } + node = rb_prev(&es->rb_node); + if (!node) + break; + es = rb_entry(node, struct extent_status, rb_node); + } + if (right_es && (!left_delayed || first_lclu != last_lclu)) { + if (end < ext4_es_end(right_es)) { + es = right_es; + } else { + node = rb_next(&right_es->rb_node); + es = node ? rb_entry(node, struct extent_status, + rb_node) : NULL; + } + while (es && es->es_lblk <= + EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) { + if (ext4_es_is_delayed(es)) { + rc->ndelayed--; + right_delayed = true; + break; + } + node = rb_next(&es->rb_node); + if (!node) + break; + es = rb_entry(node, struct extent_status, + rb_node); + } + } + + /* + * Determine the block range that should be searched for + * pending reservations, if any. Clusters on the ends of the + * original removed range containing delayed blocks are + * excluded. They've already been accounted for and it's not + * possible to determine if an associated pending reservation + * should be released with the information available in the + * extents status tree. 
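+ *
+ * Illustrative example: if the range spans clusters 10..12 and a
+ * delayed block survives in cluster 10 just left of the range, that
+ * cluster keeps its reservation (ndelayed is decremented) and only
+ * clusters 11..12 are searched for pending reservations to release.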
+ */ + if (first_lclu == last_lclu) { + if (left_delayed | right_delayed) + count_pending = false; + else + count_pending = true; + } else { + if (left_delayed) + first_lclu++; + if (right_delayed) + last_lclu--; + if (first_lclu <= last_lclu) + count_pending = true; + else + count_pending = false; + } + + /* + * a pending reservation found between first_lclu and last_lclu + * represents an allocated cluster that contained at least one + * delayed block, so the delayed total must be reduced by one + * for each pending reservation found and released + */ + if (count_pending) { + pr = __pr_tree_search(&tree->root, first_lclu); + while (pr && pr->lclu <= last_lclu) { + rc->ndelayed--; + node = rb_next(&pr->rb_node); + rb_erase(&pr->rb_node, &tree->root); + __free_pending(pr); + if (!node) + break; + pr = rb_entry(node, struct pending_reservation, + rb_node); + } + } + } + return rc->ndelayed; +} + + +/* + * __es_remove_extent - removes block range from extent status tree + * + * @inode - file containing range + * @lblk - first block in range + * @end - last block in range + * @reserved - number of cluster reservations released + * @prealloc - pre-allocated es to avoid memory allocation failures + * + * If @reserved is not NULL and delayed allocation is enabled, counts + * block/cluster reservations freed by removing range and if bigalloc + * enabled cancels pending reservations as needed. Returns 0 on success, + * error code on failure. + */ +static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t end, int *reserved, + struct extent_status *prealloc) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct rb_node *node; + struct extent_status *es; + struct extent_status orig_es; + ext4_lblk_t len1, len2; + ext4_fsblk_t block; + int err = 0; + bool count_reserved = true; + struct rsvd_count rc; + + if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC)) + count_reserved = false; + + es = __es_tree_search(&tree->root, lblk); + if (!es) + goto out; + if (es->es_lblk > end) + goto out; + + /* Simply invalidate cache_es. */ + tree->cache_es = NULL; + if (count_reserved) + init_rsvd(inode, lblk, es, &rc); + + orig_es.es_lblk = es->es_lblk; + orig_es.es_len = es->es_len; + orig_es.es_pblk = es->es_pblk; + + len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0; + len2 = ext4_es_end(es) > end ? 
ext4_es_end(es) - end : 0; + if (len1 > 0) + es->es_len = len1; + if (len2 > 0) { + if (len1 > 0) { + struct extent_status newes; + + newes.es_lblk = end + 1; + newes.es_len = len2; + block = 0x7FDEADBEEFULL; + if (ext4_es_is_written(&orig_es) || + ext4_es_is_unwritten(&orig_es)) + block = ext4_es_pblock(&orig_es) + + orig_es.es_len - len2; + ext4_es_store_pblock_status(&newes, block, + ext4_es_status(&orig_es)); + err = __es_insert_extent(inode, &newes, prealloc); + if (err) { + if (!ext4_es_must_keep(&newes)) + return 0; + + es->es_lblk = orig_es.es_lblk; + es->es_len = orig_es.es_len; + goto out; + } + } else { + es->es_lblk = end + 1; + es->es_len = len2; + if (ext4_es_is_written(es) || + ext4_es_is_unwritten(es)) { + block = orig_es.es_pblk + orig_es.es_len - len2; + ext4_es_store_pblock(es, block); + } + } + if (count_reserved) + count_rsvd(inode, orig_es.es_lblk + len1, + orig_es.es_len - len1 - len2, &orig_es, &rc); + goto out_get_reserved; + } + + if (len1 > 0) { + if (count_reserved) + count_rsvd(inode, lblk, orig_es.es_len - len1, + &orig_es, &rc); + node = rb_next(&es->rb_node); + if (node) + es = rb_entry(node, struct extent_status, rb_node); + else + es = NULL; + } + + while (es && ext4_es_end(es) <= end) { + if (count_reserved) + count_rsvd(inode, es->es_lblk, es->es_len, es, &rc); + node = rb_next(&es->rb_node); + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + if (!node) { + es = NULL; + break; + } + es = rb_entry(node, struct extent_status, rb_node); + } + + if (es && es->es_lblk < end + 1) { + ext4_lblk_t orig_len = es->es_len; + + len1 = ext4_es_end(es) - end; + if (count_reserved) + count_rsvd(inode, es->es_lblk, orig_len - len1, + es, &rc); + es->es_lblk = end + 1; + es->es_len = len1; + if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) { + block = es->es_pblk + orig_len - len1; + ext4_es_store_pblock(es, block); + } + } + +out_get_reserved: + if (count_reserved) + *reserved = get_rsvd(inode, end, es, &rc); +out: + return err; +} + +/* + * ext4_es_remove_extent - removes block range from extent status tree + * + * @inode - file containing range + * @lblk - first block in range + * @len - number of blocks to remove + * + * Reduces block/cluster reservation count and for bigalloc cancels pending + * reservations as needed. + */ +void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + ext4_lblk_t end; + int err = 0; + int reserved = 0; + struct extent_status *es = NULL; + + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return; + + trace_ext4_es_remove_extent(inode, lblk, len); + es_debug("remove [%u/%u) from extent status tree of inode %lu\n", + lblk, len, inode->i_ino); + + if (!len) + return; + + end = lblk + len - 1; + BUG_ON(end < lblk); + +retry: + if (err && !es) + es = __es_alloc_extent(true); + /* + * ext4_clear_inode() depends on us taking i_es_lock unconditionally + * so that we are sure __es_shrink() is done with the inode before it + * is reclaimed. + */ + write_lock(&EXT4_I(inode)->i_es_lock); + err = __es_remove_extent(inode, lblk, end, &reserved, es); + /* Free preallocated extent if it didn't get used. 
*/ + if (es) { + if (!es->es_len) + __es_free_extent(es); + es = NULL; + } + write_unlock(&EXT4_I(inode)->i_es_lock); + if (err) + goto retry; + + ext4_es_print_tree(inode); + ext4_da_release_space(inode, reserved); +} + +static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei) +{ + struct ext4_inode_info *ei; + struct ext4_es_stats *es_stats; + ktime_t start_time; + u64 scan_time; + int nr_to_walk; + int nr_shrunk = 0; + int retried = 0, nr_skipped = 0; + + es_stats = &sbi->s_es_stats; + start_time = ktime_get(); + +retry: + spin_lock(&sbi->s_es_lock); + nr_to_walk = sbi->s_es_nr_inode; + while (nr_to_walk-- > 0) { + if (list_empty(&sbi->s_es_list)) { + spin_unlock(&sbi->s_es_lock); + goto out; + } + ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info, + i_es_list); + /* Move the inode to the tail */ + list_move_tail(&ei->i_es_list, &sbi->s_es_list); + + /* + * Normally we try hard to avoid shrinking precached inodes, + * but we will as a last resort. + */ + if (!retried && ext4_test_inode_state(&ei->vfs_inode, + EXT4_STATE_EXT_PRECACHED)) { + nr_skipped++; + continue; + } + + if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) { + nr_skipped++; + continue; + } + /* + * Now we hold i_es_lock which protects us from inode reclaim + * freeing inode under us + */ + spin_unlock(&sbi->s_es_lock); + + nr_shrunk += es_reclaim_extents(ei, &nr_to_scan); + write_unlock(&ei->i_es_lock); + + if (nr_to_scan <= 0) + goto out; + spin_lock(&sbi->s_es_lock); + } + spin_unlock(&sbi->s_es_lock); + + /* + * If we skipped any inodes, and we weren't able to make any + * forward progress, try again to scan precached inodes. + */ + if ((nr_shrunk == 0) && nr_skipped && !retried) { + retried++; + goto retry; + } + + if (locked_ei && nr_shrunk == 0) + nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan); + +out: + scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + if (likely(es_stats->es_stats_scan_time)) + es_stats->es_stats_scan_time = (scan_time + + es_stats->es_stats_scan_time*3) / 4; + else + es_stats->es_stats_scan_time = scan_time; + if (scan_time > es_stats->es_stats_max_scan_time) + es_stats->es_stats_max_scan_time = scan_time; + if (likely(es_stats->es_stats_shrunk)) + es_stats->es_stats_shrunk = (nr_shrunk + + es_stats->es_stats_shrunk*3) / 4; + else + es_stats->es_stats_shrunk = nr_shrunk; + + trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, + nr_skipped, retried); + return nr_shrunk; +} + +static unsigned long ext4_es_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + unsigned long nr; + struct ext4_sb_info *sbi; + + sbi = shrink->private_data; + nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); + trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); + return nr; +} + +static unsigned long ext4_es_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct ext4_sb_info *sbi = shrink->private_data; + int nr_to_scan = sc->nr_to_scan; + int ret, nr_shrunk; + + ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); + trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); + + nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL); + + ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); + trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); + return nr_shrunk; +} + +int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v) +{ + struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private); + struct ext4_es_stats *es_stats = 
&sbi->s_es_stats; + struct ext4_inode_info *ei, *max = NULL; + unsigned int inode_cnt = 0; + + if (v != SEQ_START_TOKEN) + return 0; + + /* here we just find an inode that has the max nr. of objects */ + spin_lock(&sbi->s_es_lock); + list_for_each_entry(ei, &sbi->s_es_list, i_es_list) { + inode_cnt++; + if (max && max->i_es_all_nr < ei->i_es_all_nr) + max = ei; + else if (!max) + max = ei; + } + spin_unlock(&sbi->s_es_lock); + + seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", + percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), + percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt)); + seq_printf(seq, " %lld/%lld cache hits/misses\n", + percpu_counter_sum_positive(&es_stats->es_stats_cache_hits), + percpu_counter_sum_positive(&es_stats->es_stats_cache_misses)); + if (inode_cnt) + seq_printf(seq, " %d inodes on list\n", inode_cnt); + + seq_printf(seq, "average:\n %llu us scan time\n", + div_u64(es_stats->es_stats_scan_time, 1000)); + seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk); + if (inode_cnt) + seq_printf(seq, + "maximum:\n %lu inode (%u objects, %u reclaimable)\n" + " %llu us max scan time\n", + max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr, + div_u64(es_stats->es_stats_max_scan_time, 1000)); + + return 0; +} + +int ext4_es_register_shrinker(struct ext4_sb_info *sbi) +{ + int err; + + /* Make sure we have enough bits for physical block number */ + BUILD_BUG_ON(ES_SHIFT < 48); + INIT_LIST_HEAD(&sbi->s_es_list); + sbi->s_es_nr_inode = 0; + spin_lock_init(&sbi->s_es_lock); + sbi->s_es_stats.es_stats_shrunk = 0; + err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0, + GFP_KERNEL); + if (err) + return err; + err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0, + GFP_KERNEL); + if (err) + goto err1; + sbi->s_es_stats.es_stats_scan_time = 0; + sbi->s_es_stats.es_stats_max_scan_time = 0; + err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL); + if (err) + goto err2; + err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL); + if (err) + goto err3; + + sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id); + if (!sbi->s_es_shrinker) { + err = -ENOMEM; + goto err4; + } + + sbi->s_es_shrinker->scan_objects = ext4_es_scan; + sbi->s_es_shrinker->count_objects = ext4_es_count; + sbi->s_es_shrinker->private_data = sbi; + + shrinker_register(sbi->s_es_shrinker); + + return 0; +err4: + percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); +err3: + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); +err2: + percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); +err1: + percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits); + return err; +} + +void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) +{ + percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); + shrinker_free(sbi->s_es_shrinker); +} + +/* + * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at + * most *nr_to_scan extents, update *nr_to_scan accordingly. + * + * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan. + * Increment *nr_shrunk by the number of reclaimed extents. Also update + * ei->i_es_shrink_lblk to where we should continue scanning. 
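+ * The cursor lets successive shrinker passes resume where the last
+ * one stopped; es_reclaim_extents() wraps the scan back to the
+ * start of the tree once the end is reached.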
+ */ +static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end, + int *nr_to_scan, int *nr_shrunk) +{ + struct inode *inode = &ei->vfs_inode; + struct ext4_es_tree *tree = &ei->i_es_tree; + struct extent_status *es; + struct rb_node *node; + + es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk); + if (!es) + goto out_wrap; + + while (*nr_to_scan > 0) { + if (es->es_lblk > end) { + ei->i_es_shrink_lblk = end + 1; + return 0; + } + + (*nr_to_scan)--; + node = rb_next(&es->rb_node); + + if (ext4_es_must_keep(es)) + goto next; + if (ext4_es_is_referenced(es)) { + ext4_es_clear_referenced(es); + goto next; + } + + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + (*nr_shrunk)++; +next: + if (!node) + goto out_wrap; + es = rb_entry(node, struct extent_status, rb_node); + } + ei->i_es_shrink_lblk = es->es_lblk; + return 1; +out_wrap: + ei->i_es_shrink_lblk = 0; + return 0; +} + +static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan) +{ + struct inode *inode = &ei->vfs_inode; + int nr_shrunk = 0; + ext4_lblk_t start = ei->i_es_shrink_lblk; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if (ei->i_es_shk_nr == 0) + return 0; + + if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && + __ratelimit(&_rs)) + ext4_warning(inode->i_sb, "forced shrink of precached extents"); + + if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) && + start != 0) + es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk); + + ei->i_es_tree.cache_es = NULL; + return nr_shrunk; +} + +/* + * Called to support EXT4_IOC_CLEAR_ES_CACHE. We can only remove + * discretionary entries from the extent status cache. (Some entries + * must be present for proper operations.) 
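+ * Concretely, only entries for which ext4_es_must_keep() returns
+ * false are removed; delayed extents stay because fiemap, bigalloc
+ * accounting, and seek_data/hole depend on them.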
+ */ +void ext4_clear_inode_es(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct extent_status *es; + struct ext4_es_tree *tree; + struct rb_node *node; + + write_lock(&ei->i_es_lock); + tree = &EXT4_I(inode)->i_es_tree; + tree->cache_es = NULL; + node = rb_first(&tree->root); + while (node) { + es = rb_entry(node, struct extent_status, rb_node); + node = rb_next(node); + if (!ext4_es_must_keep(es)) { + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + } + } + ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED); + write_unlock(&ei->i_es_lock); +} + +#ifdef ES_DEBUG__ +static void ext4_print_pending_tree(struct inode *inode) +{ + struct ext4_pending_tree *tree; + struct rb_node *node; + struct pending_reservation *pr; + + printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino); + tree = &EXT4_I(inode)->i_pending_tree; + node = rb_first(&tree->root); + while (node) { + pr = rb_entry(node, struct pending_reservation, rb_node); + printk(KERN_DEBUG " %u", pr->lclu); + node = rb_next(node); + } + printk(KERN_DEBUG "\n"); +} +#else +#define ext4_print_pending_tree(inode) +#endif + +int __init ext4_init_pending(void) +{ + ext4_pending_cachep = KMEM_CACHE(pending_reservation, SLAB_RECLAIM_ACCOUNT); + if (ext4_pending_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_pending(void) +{ + kmem_cache_destroy(ext4_pending_cachep); +} + +void ext4_init_pending_tree(struct ext4_pending_tree *tree) +{ + tree->root = RB_ROOT; +} + +/* + * __get_pending - retrieve a pointer to a pending reservation + * + * @inode - file containing the pending cluster reservation + * @lclu - logical cluster of interest + * + * Returns a pointer to a pending reservation if it's a member of + * the set, and NULL if not. Must be called holding i_es_lock. + */ +static struct pending_reservation *__get_pending(struct inode *inode, + ext4_lblk_t lclu) +{ + struct ext4_pending_tree *tree; + struct rb_node *node; + struct pending_reservation *pr = NULL; + + tree = &EXT4_I(inode)->i_pending_tree; + node = (&tree->root)->rb_node; + + while (node) { + pr = rb_entry(node, struct pending_reservation, rb_node); + if (lclu < pr->lclu) + node = node->rb_left; + else if (lclu > pr->lclu) + node = node->rb_right; + else if (lclu == pr->lclu) + return pr; + } + return NULL; +} + +/* + * __insert_pending - adds a pending cluster reservation to the set of + * pending reservations + * + * @inode - file containing the cluster + * @lblk - logical block in the cluster to be added + * @prealloc - preallocated pending entry + * + * Returns 1 on successful insertion and -ENOMEM on failure. If the + * pending reservation is already in the set, returns successfully. 
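+ * A duplicate insert returns 0 rather than 1, so callers such as
+ * __revise_pending() count only newly created reservations.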
+ */ +static int __insert_pending(struct inode *inode, ext4_lblk_t lblk, + struct pending_reservation **prealloc) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; + struct rb_node **p = &tree->root.rb_node; + struct rb_node *parent = NULL; + struct pending_reservation *pr; + ext4_lblk_t lclu; + int ret = 0; + + lclu = EXT4_B2C(sbi, lblk); + /* search to find parent for insertion */ + while (*p) { + parent = *p; + pr = rb_entry(parent, struct pending_reservation, rb_node); + + if (lclu < pr->lclu) { + p = &(*p)->rb_left; + } else if (lclu > pr->lclu) { + p = &(*p)->rb_right; + } else { + /* pending reservation already inserted */ + goto out; + } + } + + if (likely(*prealloc == NULL)) { + pr = __alloc_pending(false); + if (!pr) { + ret = -ENOMEM; + goto out; + } + } else { + pr = *prealloc; + *prealloc = NULL; + } + pr->lclu = lclu; + + rb_link_node(&pr->rb_node, parent, p); + rb_insert_color(&pr->rb_node, &tree->root); + ret = 1; + +out: + return ret; +} + +/* + * __remove_pending - removes a pending cluster reservation from the set + * of pending reservations + * + * @inode - file containing the cluster + * @lblk - logical block in the pending cluster reservation to be removed + * + * Returns successfully if pending reservation is not a member of the set. + */ +static void __remove_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct pending_reservation *pr; + struct ext4_pending_tree *tree; + + pr = __get_pending(inode, EXT4_B2C(sbi, lblk)); + if (pr != NULL) { + tree = &EXT4_I(inode)->i_pending_tree; + rb_erase(&pr->rb_node, &tree->root); + __free_pending(pr); + } +} + +/* + * ext4_remove_pending - removes a pending cluster reservation from the set + * of pending reservations + * + * @inode - file containing the cluster + * @lblk - logical block in the pending cluster reservation to be removed + * + * Locking for external use of __remove_pending. + */ +void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + write_lock(&ei->i_es_lock); + __remove_pending(inode, lblk); + write_unlock(&ei->i_es_lock); +} + +/* + * ext4_is_pending - determine whether a cluster has a pending reservation + * on it + * + * @inode - file containing the cluster + * @lblk - logical block in the cluster + * + * Returns true if there's a pending reservation for the cluster in the + * set of pending reservations, and false if not. + */ +bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + bool ret; + + read_lock(&ei->i_es_lock); + ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL); + read_unlock(&ei->i_es_lock); + + return ret; +} + +/* + * ext4_es_insert_delayed_extent - adds some delayed blocks to the extents + * status tree, adding a pending reservation + * where needed + * + * @inode - file containing the newly added block + * @lblk - start logical block to be added + * @len - length of blocks to be added + * @lclu_allocated/end_allocated - indicates whether a physical cluster has + * been allocated for the logical cluster + * that contains the start/end block. 
Note that + * end_allocated should always be set to false + * if the start and the end block are in the + * same cluster + */ +void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, bool lclu_allocated, + bool end_allocated) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct extent_status newes; + ext4_lblk_t end = lblk + len - 1; + int err1 = 0, err2 = 0, err3 = 0; + struct extent_status *es1 = NULL; + struct extent_status *es2 = NULL; + struct pending_reservation *pr1 = NULL; + struct pending_reservation *pr2 = NULL; + + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return; + + es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n", + lblk, len, inode->i_ino); + if (!len) + return; + + WARN_ON_ONCE((EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) && + end_allocated); + + newes.es_lblk = lblk; + newes.es_len = len; + ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED); + trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated, + end_allocated); + + ext4_es_insert_extent_check(inode, &newes); + +retry: + if (err1 && !es1) + es1 = __es_alloc_extent(true); + if ((err1 || err2) && !es2) + es2 = __es_alloc_extent(true); + if (err1 || err2 || err3 < 0) { + if (lclu_allocated && !pr1) + pr1 = __alloc_pending(true); + if (end_allocated && !pr2) + pr2 = __alloc_pending(true); + } + write_lock(&EXT4_I(inode)->i_es_lock); + + err1 = __es_remove_extent(inode, lblk, end, NULL, es1); + if (err1 != 0) + goto error; + /* Free preallocated extent if it didn't get used. */ + if (es1) { + if (!es1->es_len) + __es_free_extent(es1); + es1 = NULL; + } + + err2 = __es_insert_extent(inode, &newes, es2); + if (err2 != 0) + goto error; + /* Free preallocated extent if it didn't get used. */ + if (es2) { + if (!es2->es_len) + __es_free_extent(es2); + es2 = NULL; + } + + if (lclu_allocated) { + err3 = __insert_pending(inode, lblk, &pr1); + if (err3 < 0) + goto error; + if (pr1) { + __free_pending(pr1); + pr1 = NULL; + } + } + if (end_allocated) { + err3 = __insert_pending(inode, end, &pr2); + if (err3 < 0) + goto error; + if (pr2) { + __free_pending(pr2); + pr2 = NULL; + } + } +error: + write_unlock(&EXT4_I(inode)->i_es_lock); + if (err1 || err2 || err3 < 0) + goto retry; + + ext4_es_print_tree(inode); + ext4_print_pending_tree(inode); + return; +} + +/* + * __revise_pending - makes, cancels, or leaves unchanged pending cluster + * reservations for a specified block range depending + * upon the presence or absence of delayed blocks + * outside the range within clusters at the ends of the + * range + * + * @inode - file containing the range + * @lblk - logical block defining the start of range + * @len - length of range in blocks + * @prealloc - preallocated pending entry + * + * Used after a newly allocated extent is added to the extents status tree. + * Requires that the extents in the range have either written or unwritten + * status. Must be called while holding i_es_lock. Returns number of new + * inserts pending cluster on insert pendings, returns 0 on remove pendings, + * return -ENOMEM on failure. 
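+ * (In other words: a non-negative return value is the number of
+ * pending reservations newly inserted, 0 when reservations were
+ * only removed or left unchanged, and -ENOMEM on allocation
+ * failure.)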
+ */ +static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, + struct pending_reservation **prealloc) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t end = lblk + len - 1; + ext4_lblk_t first, last; + bool f_del = false, l_del = false; + int pendings = 0; + int ret = 0; + + if (len == 0) + return 0; + + /* + * Two cases - block range within single cluster and block range + * spanning two or more clusters. Note that a cluster belonging + * to a range starting and/or ending on a cluster boundary is treated + * as if it does not contain a delayed extent. The new range may + * have allocated space for previously delayed blocks out to the + * cluster boundary, requiring that any pre-existing pending + * reservation be canceled. Because this code only looks at blocks + * outside the range, it should revise pending reservations + * correctly even if the extent represented by the range can't be + * inserted in the extents status tree due to ENOSPC. + */ + + if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) { + first = EXT4_LBLK_CMASK(sbi, lblk); + if (first != lblk) + f_del = __es_scan_range(inode, &ext4_es_is_delayed, + first, lblk - 1); + if (f_del) { + ret = __insert_pending(inode, first, prealloc); + if (ret < 0) + goto out; + pendings += ret; + } else { + last = EXT4_LBLK_CMASK(sbi, end) + + sbi->s_cluster_ratio - 1; + if (last != end) + l_del = __es_scan_range(inode, + &ext4_es_is_delayed, + end + 1, last); + if (l_del) { + ret = __insert_pending(inode, last, prealloc); + if (ret < 0) + goto out; + pendings += ret; + } else + __remove_pending(inode, last); + } + } else { + first = EXT4_LBLK_CMASK(sbi, lblk); + if (first != lblk) + f_del = __es_scan_range(inode, &ext4_es_is_delayed, + first, lblk - 1); + if (f_del) { + ret = __insert_pending(inode, first, prealloc); + if (ret < 0) + goto out; + pendings += ret; + } else + __remove_pending(inode, first); + + last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; + if (last != end) + l_del = __es_scan_range(inode, &ext4_es_is_delayed, + end + 1, last); + if (l_del) { + ret = __insert_pending(inode, last, prealloc); + if (ret < 0) + goto out; + pendings += ret; + } else + __remove_pending(inode, last); + } +out: + return (ret < 0) ? ret : pendings; +} diff --git a/fs/ext4l/extents_status.h b/fs/ext4l/extents_status.h new file mode 100644 index 00000000000..8f9c008d11e --- /dev/null +++ b/fs/ext4l/extents_status.h @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/ext4/extents_status.h + * + * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> + * Modified by + * Allison Henderson <achender@linux.vnet.ibm.com> + * Zheng Liu <wenqing.lz@taobao.com> + * + */ + +#ifndef _EXT4_EXTENTS_STATUS_H +#define _EXT4_EXTENTS_STATUS_H + +/* + * Turn on ES_DEBUG__ to get lots of info about extent status operations. + */ +#ifdef ES_DEBUG__ +#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* + * With ES_AGGRESSIVE_TEST defined, the result of es caching will be + * checked with old map_block's result. 
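+ * The trailing underscores in the define below leave the check
+ * disabled; renaming it to ES_AGGRESSIVE_TEST enables the (slow)
+ * cross-check in ext4_es_insert_extent_check().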
+ */ +#define ES_AGGRESSIVE_TEST__ + +/* + * These flags live in the high bits of extent_status.es_pblk + */ +enum { + ES_WRITTEN_B, + ES_UNWRITTEN_B, + ES_DELAYED_B, + ES_HOLE_B, + ES_REFERENCED_B, + ES_FLAGS +}; + +#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) +#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) + +/* + * Besides EXTENT_STATUS_REFERENCED, all these extent type masks + * are exclusive, only one type can be set at a time. + */ +#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) +#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) +#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) +#define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) +#define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B) + +#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ + EXTENT_STATUS_UNWRITTEN | \ + EXTENT_STATUS_DELAYED | \ + EXTENT_STATUS_HOLE)) + +#define ES_TYPE_VALID(type) ((type) && !((type) & ((type) - 1))) + +struct ext4_sb_info; +struct ext4_extent; + +struct extent_status { + struct rb_node rb_node; + ext4_lblk_t es_lblk; /* first logical block extent covers */ + ext4_lblk_t es_len; /* length of extent in block */ + ext4_fsblk_t es_pblk; /* first physical block */ +}; + +struct ext4_es_tree { + struct rb_root root; + struct extent_status *cache_es; /* recently accessed extent */ +}; + +struct ext4_es_stats { + unsigned long es_stats_shrunk; + struct percpu_counter es_stats_cache_hits; + struct percpu_counter es_stats_cache_misses; + u64 es_stats_scan_time; + u64 es_stats_max_scan_time; + struct percpu_counter es_stats_all_cnt; + struct percpu_counter es_stats_shk_cnt; +}; + +/* + * Pending cluster reservations for bigalloc file systems + * + * A cluster with a pending reservation is a logical cluster shared by at + * least one extent in the extents status tree with delayed and unwritten + * status and at least one other written or unwritten extent. The + * reservation is said to be pending because a cluster reservation would + * have to be taken in the event all blocks in the cluster shared with + * written or unwritten extents were deleted while the delayed and + * unwritten blocks remained. + * + * The set of pending cluster reservations is an auxiliary data structure + * used with the extents status tree to implement reserved cluster/block + * accounting for bigalloc file systems. The set is kept in memory and + * records all pending cluster reservations. + * + * Its primary function is to avoid the need to read extents from the + * disk when invalidating pages as a result of a truncate, punch hole, or + * collapse range operation. Page invalidation requires a decrease in the + * reserved cluster count if it results in the removal of all delayed + * and unwritten extents (blocks) from a cluster that is not shared with a + * written or unwritten extent, and no decrease otherwise. Determining + * whether the cluster is shared can be done by searching for a pending + * reservation on it. + * + * Secondarily, it provides a potentially faster method for determining + * whether the reserved cluster count should be increased when a physical + * cluster is deallocated as a result of a truncate, punch hole, or + * collapse range operation. The necessary information is also present + * in the extents status tree, but might be more rapidly accessed in + * the pending reservation set in many cases due to smaller size. + * + * The pending cluster reservation set is implemented as a red-black tree + * with the goal of minimizing per page search time overhead. 
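+ *
+ * Illustrative example (cluster ratio 4): if blocks 0-1 of a
+ * cluster are written and blocks 2-3 are delayed, the cluster is
+ * already allocated and no reservation is held for the delayed
+ * blocks. If the written blocks are later punched out while the
+ * delayed blocks remain, a reservation must be taken again; the
+ * pending entry records that obligation.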
+ */ + +struct pending_reservation { + struct rb_node rb_node; + ext4_lblk_t lclu; +}; + +struct ext4_pending_tree { + struct rb_root root; +}; + +extern int __init ext4_init_es(void); +extern void ext4_exit_es(void); +extern void ext4_es_init_tree(struct ext4_es_tree *tree); + +extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status, + bool delalloc_reserve_used); +extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); +extern void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_es_find_extent_range(struct inode *inode, + int (*match_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es); +extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t *next_lblk, + struct extent_status *es); +extern bool ext4_es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end); +extern bool ext4_es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk); + +static inline unsigned int ext4_es_status(struct extent_status *es) +{ + return es->es_pblk >> ES_SHIFT; +} + +static inline unsigned int ext4_es_type(struct extent_status *es) +{ + return (es->es_pblk >> ES_SHIFT) & ES_TYPE_MASK; +} + +static inline int ext4_es_is_written(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0; +} + +static inline int ext4_es_is_unwritten(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0; +} + +static inline int ext4_es_is_delayed(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; +} + +static inline int ext4_es_is_hole(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; +} + +static inline int ext4_es_is_mapped(struct extent_status *es) +{ + return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); +} + +static inline void ext4_es_set_referenced(struct extent_status *es) +{ + es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; +} + +static inline void ext4_es_clear_referenced(struct extent_status *es) +{ + es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT); +} + +static inline int ext4_es_is_referenced(struct extent_status *es) +{ + return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0; +} + +static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) +{ + return es->es_pblk & ~ES_MASK; +} + +static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es) +{ + ext4_fsblk_t pblock = ext4_es_pblock(es); + return pblock == ~ES_MASK ? 
0 : pblock; +} + +static inline void ext4_es_store_pblock(struct extent_status *es, + ext4_fsblk_t pb) +{ + ext4_fsblk_t block; + + block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); + es->es_pblk = block; +} + +static inline void ext4_es_store_pblock_status(struct extent_status *es, + ext4_fsblk_t pb, + unsigned int status) +{ + WARN_ON_ONCE(!ES_TYPE_VALID(status & ES_TYPE_MASK)); + + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (pb & ~ES_MASK); +} + +extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); +extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); + +extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); + +extern int __init ext4_init_pending(void); +extern void ext4_exit_pending(void); +extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); +extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); +extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); +extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, bool lclu_allocated, + bool end_allocated); +extern void ext4_clear_inode_es(struct inode *inode); + +#endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4l/fsync.c b/fs/ext4l/fsync.c new file mode 100644 index 00000000000..e476c6de307 --- /dev/null +++ b/fs/ext4l/fsync.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/fsync.c + * + * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) + * from + * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4fs fsync primitive + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Removed unnecessary code duplication for little endian machines + * and excessive __inline__s. + * Andi Kleen, 1997 + * + * Major simplications and cleanup - we only need to do the metadata, because + * we can depend on generic_block_fdatasync() to sync the data blocks. + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/sched.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> +#include <linux/buffer_head.h> + +#include "ext4.h" +#include "ext4_jbd2.h" + +#include <trace/events/ext4.h> + +/* + * If we're not journaling and this is a just-created file, we have to + * sync our parent directory (if it was freshly created) since + * otherwise it will only be written by writeback, leaving a huge + * window during which a crash may lose the file. This may apply for + * the parent directory's parent as well, and so on recursively, if + * they are also freshly created. + */ +static int ext4_sync_parent(struct inode *inode) +{ + struct dentry *dentry, *next; + int ret = 0; + + if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) + return 0; + dentry = d_find_any_alias(inode); + if (!dentry) + return 0; + while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { + ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); + + next = dget_parent(dentry); + dput(dentry); + dentry = next; + inode = dentry->d_inode; + + /* + * The directory inode may have gone through rmdir by now. But + * the inode itself and its blocks are still allocated (we hold + * a reference to the inode via its dentry), so it didn't go + * through ext4_evict_inode()) and so we are safe to flush + * metadata blocks and the inode. 
+		 */
+		ret = sync_mapping_buffers(inode->i_mapping);
+		if (ret)
+			break;
+		ret = sync_inode_metadata(inode, 1);
+		if (ret)
+			break;
+	}
+	dput(dentry);
+	return ret;
+}
+
+static int ext4_fsync_nojournal(struct file *file, loff_t start, loff_t end,
+				int datasync, bool *needs_barrier)
+{
+	struct inode *inode = file->f_inode;
+	int ret;
+
+	ret = generic_buffers_fsync_noflush(file, start, end, datasync);
+	if (!ret)
+		ret = ext4_sync_parent(inode);
+	if (test_opt(inode->i_sb, BARRIER))
+		*needs_barrier = true;
+
+	return ret;
+}
+
+static int ext4_fsync_journal(struct inode *inode, bool datasync,
+			      bool *needs_barrier)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+	tid_t commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+
+	/*
+	 * Fast commit does not really support fsync on directories or other
+	 * special files. Force a full commit.
+	 */
+	if (!S_ISREG(inode->i_mode))
+		return ext4_force_commit(inode->i_sb);
+
+	if (journal->j_flags & JBD2_BARRIER &&
+	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+		*needs_barrier = true;
+
+	return ext4_fc_commit(journal, commit_tid);
+}
+
+/*
+ * akpm: A new design for ext4_sync_file().
+ *
+ * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
+ * There cannot be a transaction open by this task.
+ * Another task could have dirtied this inode. Its data can be in any
+ * state in the journalling system.
+ *
+ * What we do is just kick off a commit and wait on it. This will snapshot the
+ * inode to disk.
+ */
+int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	int ret = 0, err;
+	bool needs_barrier = false;
+	struct inode *inode = file->f_mapping->host;
+
+	ret = ext4_emergency_state(inode->i_sb);
+	if (unlikely(ret))
+		return ret;
+
+	ASSERT(ext4_journal_current_handle() == NULL);
+
+	trace_ext4_sync_file_enter(file, datasync);
+
+	if (sb_rdonly(inode->i_sb))
+		goto out;
+
+	if (!EXT4_SB(inode->i_sb)->s_journal) {
+		ret = ext4_fsync_nojournal(file, start, end, datasync,
+					   &needs_barrier);
+		if (needs_barrier)
+			goto issue_flush;
+		goto out;
+	}
+
+	ret = file_write_and_wait_range(file, start, end);
+	if (ret)
+		goto out;
+
+	/*
+	 * The caller's filemap_fdatawrite()/wait will sync the data.
+	 * Metadata is in the journal, we wait for the proper transaction to
+	 * commit here.
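+	 *
+	 * For illustration (sketch): after an in-place overwrite of an
+	 * already-allocated block,
+	 *
+	 *	pwrite(fd, buf, 4096, 0);
+	 *	fdatasync(fd);
+	 *
+	 * only the transaction in ei->i_datasync_tid needs to commit, while a
+	 * full fsync() also waits on ei->i_sync_tid, which advances on any
+	 * inode change (timestamps included); see ext4_fsync_journal() above.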
+ */ + ret = ext4_fsync_journal(inode, datasync, &needs_barrier); + +issue_flush: + if (needs_barrier) { + err = blkdev_issue_flush(inode->i_sb->s_bdev); + if (!ret) + ret = err; + } +out: + err = file_check_and_advance_wb_err(file); + if (ret == 0) + ret = err; + trace_ext4_sync_file_exit(inode, ret); + return ret; +} diff --git a/fs/ext4l/hash.c b/fs/ext4l/hash.c new file mode 100644 index 00000000000..33cd5b6b02d --- /dev/null +++ b/fs/ext4l/hash.c @@ -0,0 +1,323 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/hash.c + * + * Copyright (C) 2002 by Theodore Ts'o + */ + +#include <linux/fs.h> +#include <linux/unicode.h> +#include <linux/compiler.h> +#include <linux/bitops.h> +#include "ext4.h" + +#define DELTA 0x9E3779B9 + +static void TEA_transform(__u32 buf[4], __u32 const in[]) +{ + __u32 sum = 0; + __u32 b0 = buf[0], b1 = buf[1]; + __u32 a = in[0], b = in[1], c = in[2], d = in[3]; + int n = 16; + + do { + sum += DELTA; + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); + } while (--n); + + buf[0] += b0; + buf[1] += b1; +} + +/* F, G and H are basic MD4 functions: selection, majority, parity */ +#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) + +/* + * The generic round function. The application is so specific that + * we don't bother protecting all the arguments with parens, as is generally + * good macro practice, in favor of extra legibility. + * Rotation is separate from addition to prevent recomputation + */ +#define ROUND(f, a, b, c, d, x, s) \ + (a += f(b, c, d) + x, a = rol32(a, s)) +#define K1 0 +#define K2 013240474631UL +#define K3 015666365641UL + +/* + * Basic cut-down MD4 transform. Returns only 32 bits of result. 
+ */ +static __u32 half_md4_transform(__u32 buf[4], __u32 const in[8]) +{ + __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3]; + + /* Round 1 */ + ROUND(F, a, b, c, d, in[0] + K1, 3); + ROUND(F, d, a, b, c, in[1] + K1, 7); + ROUND(F, c, d, a, b, in[2] + K1, 11); + ROUND(F, b, c, d, a, in[3] + K1, 19); + ROUND(F, a, b, c, d, in[4] + K1, 3); + ROUND(F, d, a, b, c, in[5] + K1, 7); + ROUND(F, c, d, a, b, in[6] + K1, 11); + ROUND(F, b, c, d, a, in[7] + K1, 19); + + /* Round 2 */ + ROUND(G, a, b, c, d, in[1] + K2, 3); + ROUND(G, d, a, b, c, in[3] + K2, 5); + ROUND(G, c, d, a, b, in[5] + K2, 9); + ROUND(G, b, c, d, a, in[7] + K2, 13); + ROUND(G, a, b, c, d, in[0] + K2, 3); + ROUND(G, d, a, b, c, in[2] + K2, 5); + ROUND(G, c, d, a, b, in[4] + K2, 9); + ROUND(G, b, c, d, a, in[6] + K2, 13); + + /* Round 3 */ + ROUND(H, a, b, c, d, in[3] + K3, 3); + ROUND(H, d, a, b, c, in[7] + K3, 9); + ROUND(H, c, d, a, b, in[2] + K3, 11); + ROUND(H, b, c, d, a, in[6] + K3, 15); + ROUND(H, a, b, c, d, in[1] + K3, 3); + ROUND(H, d, a, b, c, in[5] + K3, 9); + ROUND(H, c, d, a, b, in[0] + K3, 11); + ROUND(H, b, c, d, a, in[4] + K3, 15); + + buf[0] += a; + buf[1] += b; + buf[2] += c; + buf[3] += d; + + return buf[1]; /* "most hashed" word */ +} +#undef ROUND +#undef K1 +#undef K2 +#undef K3 +#undef F +#undef G +#undef H + +/* The old legacy hash */ +static __u32 dx_hack_hash_unsigned(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const unsigned char *ucp = (const unsigned char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static __u32 dx_hack_hash_signed(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const signed char *scp = (const signed char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const signed char *scp = (const signed char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + val = ((int) scp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const unsigned char *ucp = (const unsigned char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + val = ((int) ucp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +/* + * Returns the hash of a filename. If len is 0 and name is NULL, then + * this function can be used to test whether or not a hash version is + * supported. + * + * The seed is an 4 longword (32 bits) "secret" which can be used to + * uniquify a hash. If the seed is all zero's, then some default seed + * may be used. + * + * A particular hash version specifies whether or not the seed is + * represented, and whether or not the returned hash is 32 bits or 64 + * bits. 
32 bit hashes will return 0 for the minor hash. + */ +static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, + struct dx_hash_info *hinfo) +{ + __u32 hash; + __u32 minor_hash = 0; + const char *p; + int i; + __u32 in[8], buf[4]; + void (*str2hashbuf)(const char *, int, __u32 *, int) = + str2hashbuf_signed; + + /* Initialize the default seed for the hash checksum functions */ + buf[0] = 0x67452301; + buf[1] = 0xefcdab89; + buf[2] = 0x98badcfe; + buf[3] = 0x10325476; + + /* Check to see if the seed is all zero's */ + if (hinfo->seed) { + for (i = 0; i < 4; i++) { + if (hinfo->seed[i]) { + memcpy(buf, hinfo->seed, sizeof(buf)); + break; + } + } + } + + switch (hinfo->hash_version) { + case DX_HASH_LEGACY_UNSIGNED: + hash = dx_hack_hash_unsigned(name, len); + break; + case DX_HASH_LEGACY: + hash = dx_hack_hash_signed(name, len); + break; + case DX_HASH_HALF_MD4_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; + fallthrough; + case DX_HASH_HALF_MD4: + p = name; + while (len > 0) { + (*str2hashbuf)(p, len, in, 8); + half_md4_transform(buf, in); + len -= 32; + p += 32; + } + minor_hash = buf[2]; + hash = buf[1]; + break; + case DX_HASH_TEA_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; + fallthrough; + case DX_HASH_TEA: + p = name; + while (len > 0) { + (*str2hashbuf)(p, len, in, 4); + TEA_transform(buf, in); + len -= 16; + p += 16; + } + hash = buf[0]; + minor_hash = buf[1]; + break; + case DX_HASH_SIPHASH: + { + struct qstr qname = QSTR_INIT(name, len); + __u64 combined_hash; + + if (fscrypt_has_encryption_key(dir)) { + combined_hash = fscrypt_fname_siphash(dir, &qname); + } else { + ext4_warning_inode(dir, "Siphash requires key"); + return -1; + } + + hash = (__u32)(combined_hash >> 32); + minor_hash = (__u32)combined_hash; + break; + } + default: + hinfo->hash = 0; + hinfo->minor_hash = 0; + ext4_warning(dir->i_sb, + "invalid/unsupported hash tree version %u", + hinfo->hash_version); + return -EINVAL; + } + hash = hash & ~1; + if (hash == (EXT4_HTREE_EOF_32BIT << 1)) + hash = (EXT4_HTREE_EOF_32BIT - 1) << 1; + hinfo->hash = hash; + hinfo->minor_hash = minor_hash; + return 0; +} + +int ext4fs_dirhash(const struct inode *dir, const char *name, int len, + struct dx_hash_info *hinfo) +{ +#if IS_ENABLED(CONFIG_UNICODE) + const struct unicode_map *um = dir->i_sb->s_encoding; + int r, dlen; + unsigned char *buff; + struct qstr qstr = {.name = name, .len = len }; + + if (len && IS_CASEFOLDED(dir) && + (!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) { + buff = kzalloc(PATH_MAX, GFP_KERNEL); + if (!buff) + return -ENOMEM; + + dlen = utf8_casefold(um, &qstr, buff, PATH_MAX); + if (dlen < 0) { + kfree(buff); + goto opaque_seq; + } + + r = __ext4fs_dirhash(dir, buff, dlen, hinfo); + + kfree(buff); + return r; + } +opaque_seq: +#endif + return __ext4fs_dirhash(dir, name, len, hinfo); +} -- 2.43.0
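For reference, the legacy dx hash above is easy to exercise in isolation.
A minimal user-space sketch, mirroring dx_hack_hash_signed() from hash.c
(the constants are copied from the code above; the program itself is
illustrative only, not part of the patch):

	#include <stdio.h>
	#include <stdint.h>

	/* Mirrors dx_hack_hash_signed() in fs/ext4l/hash.c. */
	static uint32_t dx_hack_hash(const char *name, int len)
	{
		uint32_t hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
		const signed char *scp = (const signed char *)name;

		while (len--) {
			hash = hash1 + (hash0 ^ (((int)*scp++) * 7152373));
			if (hash & 0x80000000)
				hash -= 0x7fffffff;
			hash1 = hash0;
			hash0 = hash;
		}
		return hash0 << 1;
	}

	int main(void)
	{
		/* The same name always maps to the same htree bucket. */
		printf("dx_hack_hash(\"foo\") = 0x%08x\n",
		       dx_hack_hash("foo", 3));
		return 0;
	}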
From: Simon Glass <simon.glass@canonical.com> Copy ext4_jbd2.c, ext4_jbd2.h, fast_commit.c, and fast_commit.h from Linux v6.18 fs/ext4 directory. - ext4_jbd2: ext4-specific JBD2 journal interface wrappers - fast_commit: fast commit journal optimization Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> --- fs/ext4l/ext4_jbd2.c | 408 +++++++ fs/ext4l/ext4_jbd2.h | 461 ++++++++ fs/ext4l/fast_commit.c | 2343 ++++++++++++++++++++++++++++++++++++++++ fs/ext4l/fast_commit.h | 186 ++++ 4 files changed, 3398 insertions(+) create mode 100644 fs/ext4l/ext4_jbd2.c create mode 100644 fs/ext4l/ext4_jbd2.h create mode 100644 fs/ext4l/fast_commit.c create mode 100644 fs/ext4l/fast_commit.h diff --git a/fs/ext4l/ext4_jbd2.c b/fs/ext4l/ext4_jbd2.c new file mode 100644 index 00000000000..a0e66bc1009 --- /dev/null +++ b/fs/ext4l/ext4_jbd2.c @@ -0,0 +1,408 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Interface between ext4 and JBD + */ + +#include "ext4_jbd2.h" + +#include <trace/events/ext4.h> + +int ext4_inode_journal_mode(struct inode *inode) +{ + if (EXT4_JOURNAL(inode) == NULL) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + /* We do not support data journalling with delayed allocation */ + if (!S_ISREG(inode->i_mode) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC) && + !mapping_large_folio_support(inode->i_mapping))) { + /* We do not support data journalling for encrypted data */ + if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + } + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + BUG(); +} + +/* Just increment the non-pointer handle value */ +static handle_t *ext4_get_nojournal(void) +{ + handle_t *handle = current->journal_info; + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); + + ref_cnt++; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; + return handle; +} + + +/* Decrement the non-pointer handle value */ +static void ext4_put_nojournal(handle_t *handle) +{ + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt == 0); + + ref_cnt--; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; +} + +/* + * Wrappers for jbd2_journal_start/end. + */ +static int ext4_journal_check_start(struct super_block *sb) +{ + int ret; + journal_t *journal; + + might_sleep(); + + ret = ext4_emergency_state(sb); + if (unlikely(ret)) + return ret; + + if (WARN_ON_ONCE(sb_rdonly(sb))) + return -EROFS; + + WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); + journal = EXT4_SB(sb)->s_journal; + /* + * Special case here: if the journal has aborted behind our + * backs (eg. EIO in the commit thread), then we still need to + * take the FS itself readonly cleanly. 
+ */ + if (journal && is_journal_aborted(journal)) { + ext4_abort(sb, -journal->j_errno, "Detected aborted journal"); + return -EROFS; + } + return 0; +} + +handle_t *__ext4_journal_start_sb(struct inode *inode, + struct super_block *sb, unsigned int line, + int type, int blocks, int rsv_blocks, + int revoke_creds) +{ + journal_t *journal; + int err; + if (inode) + trace_ext4_journal_start_inode(inode, blocks, rsv_blocks, + revoke_creds, type, + _RET_IP_); + else + trace_ext4_journal_start_sb(sb, blocks, rsv_blocks, + revoke_creds, type, + _RET_IP_); + err = ext4_journal_check_start(sb); + if (err < 0) + return ERR_PTR(err); + + journal = EXT4_SB(sb)->s_journal; + if (!journal || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) + return ext4_get_nojournal(); + return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds, + GFP_NOFS, type, line); +} + +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) +{ + struct super_block *sb; + int err; + int rc; + + if (!ext4_handle_valid(handle)) { + ext4_put_nojournal(handle); + return 0; + } + + err = handle->h_err; + if (!handle->h_transaction) { + rc = jbd2_journal_stop(handle); + return err ? err : rc; + } + + sb = handle->h_transaction->t_journal->j_private; + rc = jbd2_journal_stop(handle); + + if (!err) + err = rc; + if (err) + __ext4_std_error(sb, where, line, err); + return err; +} + +handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, + int type) +{ + struct super_block *sb; + int err; + + if (!ext4_handle_valid(handle)) + return ext4_get_nojournal(); + + sb = handle->h_journal->j_private; + trace_ext4_journal_start_reserved(sb, + jbd2_handle_buffer_credits(handle), _RET_IP_); + err = ext4_journal_check_start(sb); + if (err < 0) { + jbd2_journal_free_reserved(handle); + return ERR_PTR(err); + } + + err = jbd2_journal_start_reserved(handle, type, line); + if (err < 0) + return ERR_PTR(err); + return handle; +} + +int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, + int extend_cred, int revoke_cred) +{ + if (!ext4_handle_valid(handle)) + return 0; + if (is_handle_aborted(handle)) + return -EROFS; + if (jbd2_handle_buffer_credits(handle) >= check_cred && + handle->h_revoke_credits >= revoke_cred) + return 0; + extend_cred = max(0, extend_cred - jbd2_handle_buffer_credits(handle)); + revoke_cred = max(0, revoke_cred - handle->h_revoke_credits); + return ext4_journal_extend(handle, extend_cred, revoke_cred); +} + +static void ext4_journal_abort_handle(const char *caller, unsigned int line, + const char *err_fn, + struct buffer_head *bh, + handle_t *handle, int err) +{ + char nbuf[16]; + const char *errstr = ext4_decode_error(NULL, err, nbuf); + + BUG_ON(!ext4_handle_valid(handle)); + + if (bh) + BUFFER_TRACE(bh, "abort"); + + if (!handle->h_err) + handle->h_err = err; + + if (is_handle_aborted(handle)) + return; + + printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n", + caller, line, errstr, err_fn); + + jbd2_journal_abort_handle(handle); +} + +static void ext4_check_bdev_write_error(struct super_block *sb) +{ + struct address_space *mapping = sb->s_bdev->bd_mapping; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err; + + /* + * If the block device has write error flag, it may have failed to + * async write out metadata buffers in the background. In this case, + * we could read old data from disk and write it out again, which + * may lead to on-disk filesystem inconsistency. 
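+	 *
+	 * (Sketch of the errseq pattern used below: errseq_check() is a
+	 * cheap, lockless "has a new error been recorded since the sampled
+	 * value?" test; only when it fires do we take s_bdev_wb_lock and let
+	 * errseq_check_and_advance() consume the error, so each error is
+	 * reported exactly once.)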
+ */ + if (errseq_check(&mapping->wb_err, READ_ONCE(sbi->s_bdev_wb_err))) { + spin_lock(&sbi->s_bdev_wb_lock); + err = errseq_check_and_advance(&mapping->wb_err, &sbi->s_bdev_wb_err); + spin_unlock(&sbi->s_bdev_wb_lock); + if (err) + ext4_error_err(sb, -err, + "Error while async write back metadata"); + } +} + +int __ext4_journal_get_write_access(const char *where, unsigned int line, + handle_t *handle, struct super_block *sb, + struct buffer_head *bh, + enum ext4_journal_trigger_type trigger_type) +{ + int err; + + might_sleep(); + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_write_access(handle, bh); + if (err) { + ext4_journal_abort_handle(where, line, __func__, bh, + handle, err); + return err; + } + } else + ext4_check_bdev_write_error(sb); + if (trigger_type == EXT4_JTR_NONE || + !ext4_has_feature_metadata_csum(sb)) + return 0; + BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); + jbd2_journal_set_triggers(bh, + &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers); + return 0; +} + +/* + * The ext4 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. + * + * "bh" may be NULL: a metadata block may have been freed from memory + * but there may still be a record of it in the journal, and that record + * still needs to be revoked. + */ +int __ext4_forget(const char *where, unsigned int line, handle_t *handle, + int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr) +{ + int err; + + might_sleep(); + + trace_ext4_forget(inode, is_metadata, blocknr); + BUFFER_TRACE(bh, "enter"); + + ext4_debug("forgetting bh %p: is_metadata=%d, mode %o, data mode %x\n", + bh, is_metadata, inode->i_mode, + test_opt(inode->i_sb, DATA_FLAGS)); + + /* + * In the no journal case, we should wait for the ongoing buffer + * to complete and do a forget. + */ + if (!ext4_handle_valid(handle)) { + if (bh) { + clear_buffer_dirty(bh); + wait_on_buffer(bh); + __bforget(bh); + } + return 0; + } + + /* Never use the revoke function if we are doing full data + * journaling: there is no need to, and a V1 superblock won't + * support it. Otherwise, only skip the revoke on un-journaled + * data blocks. 
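 *
 * Summarizing the cases handled below (sketch):
 *
 *	data=journal mount			-> jbd2_journal_forget()
 *	data block, inode not journaling data	-> jbd2_journal_forget()
 *	metadata, or journaled data block	-> jbd2_journal_revoke()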
*/ + + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (!is_metadata && !ext4_should_journal_data(inode))) { + if (bh) { + BUFFER_TRACE(bh, "call jbd2_journal_forget"); + err = jbd2_journal_forget(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + return err; + } + return 0; + } + + /* + * data!=journal && (is_metadata || should_journal_data(inode)) + */ + BUFFER_TRACE(bh, "call jbd2_journal_revoke"); + err = jbd2_journal_revoke(handle, blocknr, bh); + if (err) { + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + __ext4_error(inode->i_sb, where, line, true, -err, 0, + "error %d when attempting revoke", err); + } + BUFFER_TRACE(bh, "exit"); + return err; +} + +int __ext4_journal_get_create_access(const char *where, unsigned int line, + handle_t *handle, struct super_block *sb, + struct buffer_head *bh, + enum ext4_journal_trigger_type trigger_type) +{ + int err; + + if (!ext4_handle_valid(handle)) + return 0; + + err = jbd2_journal_get_create_access(handle, bh); + if (err) { + ext4_journal_abort_handle(where, line, __func__, bh, handle, + err); + return err; + } + if (trigger_type == EXT4_JTR_NONE || + !ext4_has_feature_metadata_csum(sb)) + return 0; + BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); + jbd2_journal_set_triggers(bh, + &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers); + return 0; +} + +int __ext4_handle_dirty_metadata(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct buffer_head *bh) +{ + int err = 0; + + might_sleep(); + + set_buffer_meta(bh); + set_buffer_prio(bh); + set_buffer_uptodate(bh); + if (ext4_handle_valid(handle)) { + err = jbd2_journal_dirty_metadata(handle, bh); + /* Errors can only happen due to aborted journal or a nasty bug */ + if (!is_handle_aborted(handle) && WARN_ON_ONCE(err)) { + ext4_journal_abort_handle(where, line, __func__, bh, + handle, err); + if (inode == NULL) { + pr_err("EXT4: jbd2_journal_dirty_metadata " + "failed: handle type %u started at " + "line %u, credits %u/%u, errcode %d", + handle->h_type, + handle->h_line_no, + handle->h_requested_credits, + jbd2_handle_buffer_credits(handle), err); + return err; + } + ext4_error_inode(inode, where, line, + bh->b_blocknr, + "journal_dirty_metadata failed: " + "handle type %u started at line %u, " + "credits %u/%u, errcode %d", + handle->h_type, + handle->h_line_no, + handle->h_requested_credits, + jbd2_handle_buffer_credits(handle), + err); + } + } else { + if (inode) + mark_buffer_dirty_inode(bh, inode); + else + mark_buffer_dirty(bh); + if (inode && inode_needs_sync(inode)) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + ext4_error_inode_err(inode, where, line, + bh->b_blocknr, EIO, + "IO error syncing itable block"); + err = -EIO; + } + } + } + return err; +} diff --git a/fs/ext4l/ext4_jbd2.h b/fs/ext4l/ext4_jbd2.h new file mode 100644 index 00000000000..63d17c5201b --- /dev/null +++ b/fs/ext4l/ext4_jbd2.h @@ -0,0 +1,461 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * ext4_jbd2.h + * + * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 + * + * Copyright 1998--1999 Red Hat corp --- All Rights Reserved + * + * Ext4-specific journaling extensions. 
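+ *
+ * (A convention worth noting, as a sketch: on a no-journal filesystem the
+ * handle_t pointers handed out here are not real pointers but small
+ * reference counts below EXT4_NOJOURNAL_MAX_REF_COUNT; ext4_handle_valid()
+ * below distinguishes the two cases.)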
+ */ + +#ifndef _EXT4_JBD2_H +#define _EXT4_JBD2_H + +#include <linux/fs.h> +#include <linux/jbd2.h> +#include "ext4.h" + +#define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal) + +/* Define the number of blocks we need to account to a transaction to + * modify one block of data. + * + * We may have to touch one inode, one bitmap buffer, up to three + * indirection blocks, the group and superblock summaries, and the data + * block to complete the transaction. + * + * For extents-enabled fs we may have to allocate and modify up to + * 5 levels of tree, data block (for each of these we need bitmap + group + * summaries), root which is stored in the inode, sb + */ + +#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ + (ext4_has_feature_extents(sb) ? 20U : 8U) + +/* Extended attribute operations touch at most two data buffers, + * two bitmap buffers, and two group summaries, in addition to the inode + * and the superblock, which are already accounted for. */ + +#define EXT4_XATTR_TRANS_BLOCKS 6U + +/* Define the minimum size for a transaction which modifies data. This + * needs to take into account the fact that we may end up modifying two + * quota files too (one for the group, one for the user quota). The + * superblock only gets updated once, of course, so don't bother + * counting that again for the quota updates. */ + +#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ + EXT4_XATTR_TRANS_BLOCKS - 2 + \ + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) + +/* + * Define the number of metadata blocks we need to account to modify data. + * + * This include super block, inode block, quota blocks and xattr blocks + */ +#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) + +/* Define an arbitrary limit for the amount of data we will anticipate + * writing to any given transaction. For unbounded transactions such as + * write(2) and truncate(2) we can write more than this, but we always + * start off at the maximum transaction size and grow the transaction + * optimistically as we go. */ + +#define EXT4_MAX_TRANS_DATA 64U + +/* We break up a large truncate or write transaction once the handle's + * buffer credits gets this low, we need either to extend the + * transaction or to start a new one. Reserve enough space here for + * inode, bitmap, superblock, group and indirection updates for at least + * one block, plus two quota updates. Quota allocations are not + * needed. */ + +#define EXT4_RESERVE_TRANS_BLOCKS 12U + +/* + * Number of credits needed if we need to insert an entry into a + * directory. For each new index block, we need 4 blocks (old index + * block, new index block, bitmap block, bg summary). For normal + * htree directories there are 2 levels; if the largedir feature + * enabled it's 3 levels. + */ +#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 12U + +#ifdef CONFIG_QUOTA +/* Amount of blocks needed for quota update - we know that the structure was + * allocated so we need to update only data block */ +#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((ext4_quota_capable(sb)) ? 
1 : 0) +/* Amount of blocks needed for quota insert/delete - we do some block writes + * but inode, sb and group updates are done only once */ +#define EXT4_QUOTA_INIT_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\ + (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ + +3+DQUOT_INIT_REWRITE) : 0) + +#define EXT4_QUOTA_DEL_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\ + (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ + +3+DQUOT_DEL_REWRITE) : 0) +#else +#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0 +#define EXT4_QUOTA_INIT_BLOCKS(sb) 0 +#define EXT4_QUOTA_DEL_BLOCKS(sb) 0 +#endif +#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) + +/* + * Ext4 handle operation types -- for logging purposes + */ +#define EXT4_HT_MISC 0 +#define EXT4_HT_INODE 1 +#define EXT4_HT_WRITE_PAGE 2 +#define EXT4_HT_MAP_BLOCKS 3 +#define EXT4_HT_DIR 4 +#define EXT4_HT_TRUNCATE 5 +#define EXT4_HT_QUOTA 6 +#define EXT4_HT_RESIZE 7 +#define EXT4_HT_MIGRATE 8 +#define EXT4_HT_MOVE_EXTENTS 9 +#define EXT4_HT_XATTR 10 +#define EXT4_HT_EXT_CONVERT 11 +#define EXT4_HT_MAX 12 + +int +ext4_mark_iloc_dirty(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc); + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext4_iloc *iloc); + +#define ext4_mark_inode_dirty(__h, __i) \ + __ext4_mark_inode_dirty((__h), (__i), __func__, __LINE__) +int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, + const char *func, unsigned int line); + +int ext4_expand_extra_isize(struct inode *inode, + unsigned int new_extra_isize, + struct ext4_iloc *iloc); +/* + * Wrapper functions with which ext4 calls into JBD. 
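+ *
+ * The usual calling pattern for a metadata update is (sketch):
+ *
+ *	err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE);
+ *	if (err)
+ *		goto out;
+ *	...modify bh->b_data...
+ *	err = ext4_handle_dirty_metadata(handle, NULL, bh);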
+ */ +int __ext4_journal_get_write_access(const char *where, unsigned int line, + handle_t *handle, struct super_block *sb, + struct buffer_head *bh, + enum ext4_journal_trigger_type trigger_type); + +int __ext4_forget(const char *where, unsigned int line, handle_t *handle, + int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr); + +int __ext4_journal_get_create_access(const char *where, unsigned int line, + handle_t *handle, struct super_block *sb, + struct buffer_head *bh, + enum ext4_journal_trigger_type trigger_type); + +int __ext4_handle_dirty_metadata(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct buffer_head *bh); + +#define ext4_journal_get_write_access(handle, sb, bh, trigger_type) \ + __ext4_journal_get_write_access(__func__, __LINE__, (handle), (sb), \ + (bh), (trigger_type)) +#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ + __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \ + (bh), (block_nr)) +#define ext4_journal_get_create_access(handle, sb, bh, trigger_type) \ + __ext4_journal_get_create_access(__func__, __LINE__, (handle), (sb), \ + (bh), (trigger_type)) +#define ext4_handle_dirty_metadata(handle, inode, bh) \ + __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ + (bh)) + +handle_t *__ext4_journal_start_sb(struct inode *inode, struct super_block *sb, + unsigned int line, int type, int blocks, + int rsv_blocks, int revoke_creds); +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); + +#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) + +/* Note: Do not use this for NULL handles. This is only to determine if + * a properly allocated handle is using a journal or not. */ +static inline int ext4_handle_valid(handle_t *handle) +{ + if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT) + return 0; + return 1; +} + +static inline void ext4_handle_sync(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + handle->h_sync = 1; +} + +static inline int ext4_handle_is_aborted(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + return is_handle_aborted(handle); + return 0; +} + +static inline int ext4_free_metadata_revoke_credits(struct super_block *sb, + int blocks) +{ + /* Freeing each metadata block can result in freeing one cluster */ + return blocks * EXT4_SB(sb)->s_cluster_ratio; +} + +static inline int ext4_trans_default_revoke_credits(struct super_block *sb) +{ + return ext4_free_metadata_revoke_credits(sb, 8); +} + +#define ext4_journal_start_sb(sb, type, nblocks) \ + __ext4_journal_start_sb(NULL, (sb), __LINE__, (type), (nblocks), 0,\ + ext4_trans_default_revoke_credits(sb)) + +#define ext4_journal_start(inode, type, nblocks) \ + __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0, \ + ext4_trans_default_revoke_credits((inode)->i_sb)) + +#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks)\ + __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks),\ + ext4_trans_default_revoke_credits((inode)->i_sb)) + +#define ext4_journal_start_with_revoke(inode, type, blocks, revoke_creds) \ + __ext4_journal_start((inode), __LINE__, (type), (blocks), 0, \ + (revoke_creds)) + +static inline handle_t *__ext4_journal_start(struct inode *inode, + unsigned int line, int type, + int blocks, int rsv_blocks, + int revoke_creds) +{ + return __ext4_journal_start_sb(inode, inode->i_sb, line, type, blocks, + rsv_blocks, revoke_creds); +} + +#define ext4_journal_stop(handle) \ + 
__ext4_journal_stop(__func__, __LINE__, (handle)) + +#define ext4_journal_start_reserved(handle, type) \ + __ext4_journal_start_reserved((handle), __LINE__, (type)) + +handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, + int type); + +static inline handle_t *ext4_journal_current_handle(void) +{ + return journal_current_handle(); +} + +static inline int ext4_journal_extend(handle_t *handle, int nblocks, int revoke) +{ + if (ext4_handle_valid(handle)) + return jbd2_journal_extend(handle, nblocks, revoke); + return 0; +} + +static inline int ext4_journal_restart(handle_t *handle, int nblocks, + int revoke) +{ + if (ext4_handle_valid(handle)) + return jbd2__journal_restart(handle, nblocks, revoke, GFP_NOFS); + return 0; +} + +int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, + int extend_cred, int revoke_cred); + + +/* + * Ensure @handle has at least @check_creds credits available. If not, + * transaction will be extended or restarted to contain at least @extend_cred + * credits. Before restarting transaction @fn is executed to allow for cleanup + * before the transaction is restarted. + * + * The return value is < 0 in case of error, 0 in case the handle has enough + * credits or transaction extension succeeded, 1 in case transaction had to be + * restarted. + */ +#define ext4_journal_ensure_credits_fn(handle, check_cred, extend_cred, \ + revoke_cred, fn) \ +({ \ + __label__ __ensure_end; \ + int err = __ext4_journal_ensure_credits((handle), (check_cred), \ + (extend_cred), (revoke_cred)); \ + \ + if (err <= 0) \ + goto __ensure_end; \ + err = (fn); \ + if (err < 0) \ + goto __ensure_end; \ + err = ext4_journal_restart((handle), (extend_cred), (revoke_cred)); \ + if (err == 0) \ + err = 1; \ +__ensure_end: \ + err; \ +}) + +/* + * Ensure given handle has at least requested amount of credits available, + * possibly restarting transaction if needed. We also make sure the transaction + * has space for at least ext4_trans_default_revoke_credits(sb) revoke records + * as freeing one or two blocks is very common pattern and requesting this is + * very cheap. 
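+ *
+ * A typical caller (sketch) handles the three outcomes like this:
+ *
+ *	ret = ext4_journal_ensure_credits(handle, needed, revoke);
+ *	if (ret < 0)
+ *		goto out;		error
+ *	if (ret > 0)
+ *		...the handle was restarted: re-get write access on any
+ *		buffers that were attached to the old transaction...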
+ */ +static inline int ext4_journal_ensure_credits(handle_t *handle, int credits, + int revoke_creds) +{ + return ext4_journal_ensure_credits_fn(handle, credits, credits, + revoke_creds, 0); +} + +static inline int ext4_journal_blocks_per_folio(struct inode *inode) +{ + if (EXT4_JOURNAL(inode) != NULL) + return jbd2_journal_blocks_per_folio(inode); + return 0; +} + +static inline int ext4_journal_force_commit(journal_t *journal) +{ + if (journal) + return jbd2_journal_force_commit(journal); + return 0; +} + +static inline int ext4_jbd2_inode_add_write(handle_t *handle, + struct inode *inode, loff_t start_byte, loff_t length) +{ + if (ext4_handle_valid(handle)) + return jbd2_journal_inode_ranged_write(handle, + EXT4_I(inode)->jinode, start_byte, length); + return 0; +} + +static inline int ext4_jbd2_inode_add_wait(handle_t *handle, + struct inode *inode, loff_t start_byte, loff_t length) +{ + if (ext4_handle_valid(handle)) + return jbd2_journal_inode_ranged_wait(handle, + EXT4_I(inode)->jinode, start_byte, length); + return 0; +} + +static inline void ext4_update_inode_fsync_trans(handle_t *handle, + struct inode *inode, + int datasync) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + if (ext4_handle_valid(handle) && !is_handle_aborted(handle)) { + ei->i_sync_tid = handle->h_transaction->t_tid; + if (datasync) + ei->i_datasync_tid = handle->h_transaction->t_tid; + } +} + +/* super.c */ +int ext4_force_commit(struct super_block *sb); + +/* + * Ext4 inode journal modes + */ +#define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */ +#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ +#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ + +int ext4_inode_journal_mode(struct inode *inode); + +static inline int ext4_should_journal_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE; +} + +static inline int ext4_should_order_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE; +} + +static inline int ext4_should_writeback_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; +} + +static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks) +{ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return 0; + if (!ext4_should_journal_data(inode)) + return 0; + /* + * Data blocks in one extent are contiguous, just account for partial + * clusters at extent boundaries + */ + return blocks + 2*(EXT4_SB(inode->i_sb)->s_cluster_ratio - 1); +} + +/* + * This function controls whether or not we should try to go down the + * dioread_nolock code paths, which makes it safe to avoid taking + * i_rwsem for direct I/O reads. This only works for extent-based + * files, and it doesn't work if data journaling is enabled, since the + * dioread_nolock code uses b_private to pass information back to the + * I/O completion handler, and this conflicts with the jbd's use of + * b_private. 
+ */ +static inline int ext4_should_dioread_nolock(struct inode *inode) +{ + if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) + return 0; + if (!S_ISREG(inode->i_mode)) + return 0; + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return 0; + if (ext4_should_journal_data(inode)) + return 0; + /* temporary fix to prevent generic/422 test failures */ + if (!test_opt(inode->i_sb, DELALLOC)) + return 0; + return 1; +} + +/* + * Pass journal explicitly as it may not be cached in the sbi->s_journal in some + * cases + */ +static inline int ext4_journal_destroy(struct ext4_sb_info *sbi, journal_t *journal) +{ + int err = 0; + + /* + * At this point only two things can be operating on the journal. + * JBD2 thread performing transaction commit and s_sb_upd_work + * issuing sb update through the journal. Once we set + * EXT4_JOURNAL_DESTROY, new ext4_handle_error() calls will not + * queue s_sb_upd_work and ext4_force_commit() makes sure any + * ext4_handle_error() calls from the running transaction commit are + * finished. Hence no new s_sb_upd_work can be queued after we + * flush it here. + */ + ext4_set_mount_flag(sbi->s_sb, EXT4_MF_JOURNAL_DESTROY); + + ext4_force_commit(sbi->s_sb); + flush_work(&sbi->s_sb_upd_work); + + err = jbd2_journal_destroy(journal); + sbi->s_journal = NULL; + + return err; +} + +#endif /* _EXT4_JBD2_H */ diff --git a/fs/ext4l/fast_commit.c b/fs/ext4l/fast_commit.c new file mode 100644 index 00000000000..fa66b08de99 --- /dev/null +++ b/fs/ext4l/fast_commit.c @@ -0,0 +1,2343 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * fs/ext4/fast_commit.c + * + * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com> + * + * Ext4 fast commits routines. + */ +#include "ext4.h" +#include "ext4_jbd2.h" +#include "ext4_extents.h" +#include "mballoc.h" + +#include <linux/lockdep.h> +/* + * Ext4 Fast Commits + * ----------------- + * + * Ext4 fast commits implement fine grained journalling for Ext4. + * + * Fast commits are organized as a log of tag-length-value (TLV) structs. (See + * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by + * TLV during the recovery phase. For the scenarios for which we currently + * don't have replay code, fast commit falls back to full commits. + * Fast commits record delta in one of the following three categories. + * + * (A) Directory entry updates: + * + * - EXT4_FC_TAG_UNLINK - records directory entry unlink + * - EXT4_FC_TAG_LINK - records directory entry link + * - EXT4_FC_TAG_CREAT - records inode and directory entry creation + * + * (B) File specific data range updates: + * + * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode + * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode + * + * (C) Inode metadata (mtime / ctime etc): + * + * - EXT4_FC_TAG_INODE - record the inode that should be replayed + * during recovery. Note that iblocks field is + * not replayed and instead derived during + * replay. + * Commit Operation + * ---------------- + * With fast commits, we maintain all the directory entry operations in the + * order in which they are issued in an in-memory queue. This queue is flushed + * to disk during the commit operation. We also maintain a list of inodes + * that need to be committed during a fast commit in another in memory queue of + * inodes. During the commit operation, we commit in the following order: + * + * [1] Prepare all the inodes to write out their data by setting + * "EXT4_STATE_FC_FLUSHING_DATA". 
This ensures that the inode cannot be
+ *     deleted while it is being flushed.
+ * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA"
+ *     state.
+ * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that
+ *     all the existing handles finish and no new handles can start.
+ * [4] Mark all the fast commit eligible inodes as undergoing fast commit
+ *     by setting "EXT4_STATE_FC_COMMITTING" state.
+ * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows
+ *     starting of new handles. If new handles try to start an update on
+ *     any of the inodes that are being committed, ext4_fc_track_inode()
+ *     will block until those inodes have finished the fast commit.
+ * [6] Commit all the directory entry updates in the fast commit space.
+ * [7] Commit all the changed inodes in the fast commit space and clear
+ *     "EXT4_STATE_FC_COMMITTING" for these inodes.
+ * [8] Write the tail tag (this tag ensures the atomicity, please read the
+ *     following section for more details).
+ *
+ * All the inode updates must be enclosed within jbd2_journal_start()
+ * and jbd2_journal_stop() similar to JBD2 journaling.
+ *
+ * Fast Commit Ineligibility
+ * -------------------------
+ *
+ * Not all operations are supported by fast commits today (e.g. extended
+ * attributes). Fast commit ineligibility is marked by calling
+ * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
+ * back to a full commit.
+ *
+ * Atomicity of commits
+ * --------------------
+ * In order to guarantee atomicity during the commit operation, fast commit
+ * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
+ * tail tag contains the CRC of the contents and the TID of the transaction
+ * after which this fast commit should be applied. Recovery code replays fast
+ * commit logs only if there's at least 1 valid tail present. For every fast
+ * commit operation, there is 1 tail. This means we may end up with multiple
+ * tails in the fast commit space. Here's an example:
+ *
+ * - Create a new file A and remove existing file B
+ * - fsync()
+ * - Append contents to file A
+ * - Truncate file A
+ * - fsync()
+ *
+ * The fast commit space at the end of above operations would look like this:
+ *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
+ *             |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->|
+ *
+ * Replay code should thus check for all the valid tails in the FC area.
+ *
+ * Fast Commit Replay Idempotence
+ * ------------------------------
+ *
+ * Fast commit tags are idempotent in nature provided the recovery code follows
+ * certain rules. The guiding principle that the commit path follows while
+ * committing is that it stores the result of a particular operation instead of
+ * storing the procedure.
+ *
+ * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
+ * was associated with inode 10. During fast commit, instead of storing this
+ * operation as a procedure "rename a to b", we store the resulting file system
+ * state as a "series" of outcomes:
+ *
+ * - Link dirent b to inode 10
+ * - Unlink dirent a
+ * - Inode <10> with valid refcount
+ *
+ * Now when recovery code runs, it needs to "enforce" this state on the file
+ * system. This is what guarantees idempotence of fast commit replay.
+ *
+ * Let's take an example of a procedure that is not idempotent and see how fast
+ * commits make it idempotent.
Consider the following sequence of operations:
+ *
+ * rm A;    mv B A;    read A
+ *  (x)      (y)        (z)
+ *
+ * (x), (y) and (z) are the points at which we can crash. If we store this
+ * sequence of operations as is then the replay is not idempotent. Let's say
+ * while in replay, we crash at (z). During the second replay, file A (which was
+ * actually created as a result of the "mv B A" operation) would get deleted.
+ * Thus, the file named A would be absent when we try to read A. So, this
+ * sequence of operations is not idempotent. However, as mentioned above,
+ * instead of storing the procedure fast commits store the outcome of each
+ * procedure. Thus the fast commit log for the above procedure would be as
+ * follows:
+ *
+ * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
+ * inode 11 before the replay)
+ *
+ * [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
+ *    (w)              (x)                (y)          (z)
+ *
+ * If we crash at (z), we will have file A linked to inode 11. During the second
+ * replay, we will remove file A (inode 11). But we will create it back and make
+ * it point to inode 11. We won't find B, so we'll just skip that step. At this
+ * point, the refcount for inode 11 is not reliable, but that gets fixed by the
+ * replay of the last inode 11 tag. Crashes at points (w), (x) and (y) get
+ * handled similarly. Thus, by converting a non-idempotent procedure into a
+ * series of idempotent outcomes, fast commits ensure idempotence during the
+ * replay.
+ *
+ * Locking
+ * -------
+ * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit
+ * dentry queue. ei->i_fc_lock protects the fast commit related info in a given
+ * inode. Most of the code avoids acquiring both the locks, but if one must do
+ * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock.
+ *
+ * TODOs
+ * -----
+ *
+ * 0) Fast commit replay path hardening: Fast commit replay code should use
+ *    journal handles to make sure all the updates it does during the replay
+ *    path are atomic. With that, if we crash during fast commit replay and then
+ *    attempt recovery again, we will find a file system where the fast commit
+ *    area is invalid (because a new full commit would be found). In order to
+ *    deal with that, fast commit replay code should ensure that the "FC_REPLAY"
+ *    superblock state is persisted before starting the replay, so that after
+ *    the crash, fast commit recovery code can look at that flag and perform
+ *    fast commit recovery even if that area is invalidated by later full
+ *    commits.
+ *
+ * 1) Handle more ineligible cases.
+ *
+ * 2) Change ext4_fc_commit() to look up the logical-to-physical mapping using
+ *    the extent status tree. This would get rid of the need to call
+ *    ext4_fc_track_inode() before acquiring i_data_sem. To do that we would
+ *    need to ensure that modified extents from the extent status tree are not
+ *    evicted from memory.
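+ *
+ * On-disk, every record above is a TLV (sketch; see struct ext4_fc_tl in
+ * fast_commit.h):
+ *
+ *	__le16 fc_tag;		e.g. EXT4_FC_TAG_CREAT, EXT4_FC_TAG_TAIL
+ *	__le16 fc_len;		length of the value that follows
+ *	u8     fc_val[fc_len];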
+ */ + +#include <trace/events/ext4.h> +static struct kmem_cache *ext4_fc_dentry_cachep; + +static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate) +{ + BUFFER_TRACE(bh, ""); + if (uptodate) { + ext4_debug("%s: Block %lld up-to-date", + __func__, bh->b_blocknr); + set_buffer_uptodate(bh); + } else { + ext4_debug("%s: Block %lld not up-to-date", + __func__, bh->b_blocknr); + clear_buffer_uptodate(bh); + } + + unlock_buffer(bh); +} + +static inline void ext4_fc_reset_inode(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + ei->i_fc_lblk_start = 0; + ei->i_fc_lblk_len = 0; +} + +void ext4_fc_init_inode(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + ext4_fc_reset_inode(inode); + ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); + INIT_LIST_HEAD(&ei->i_fc_list); + INIT_LIST_HEAD(&ei->i_fc_dilist); + init_waitqueue_head(&ei->i_fc_wait); +} + +static bool ext4_fc_disabled(struct super_block *sb) +{ + return (!test_opt2(sb, JOURNAL_FAST_COMMIT) || + (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)); +} + +/* + * Remove inode from fast commit list. If the inode is being committed + * we wait until inode commit is done. + */ +void ext4_fc_del(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_fc_dentry_update *fc_dentry; + wait_queue_head_t *wq; + + if (ext4_fc_disabled(inode->i_sb)) + return; + + mutex_lock(&sbi->s_fc_lock); + if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { + mutex_unlock(&sbi->s_fc_lock); + return; + } + + /* + * Since ext4_fc_del is called from ext4_evict_inode while having a + * handle open, there is no need for us to wait here even if a fast + * commit is going on. That is because, if this inode is being + * committed, ext4_mark_inode_dirty would have waited for inode commit + * operation to finish before we come here. So, by the time we come + * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So, + * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode + * here. + * + * We may come here without any handles open in the "no_delete" case of + * ext4_evict_inode as well. However, if that happens, we first mark the + * file system as fast commit ineligible anyway. So, even in that case, + * it is okay to remove the inode from the fc list. + */ + WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING) + && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)); + while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_FLUSHING_DATA); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_FLUSHING_DATA); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_FLUSHING_DATA); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_FLUSHING_DATA); +#endif + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { + mutex_unlock(&sbi->s_fc_lock); + schedule(); + mutex_lock(&sbi->s_fc_lock); + } + finish_wait(wq, &wait.wq_entry); + } + list_del_init(&ei->i_fc_list); + + /* + * Since this inode is getting removed, let's also remove all FC + * dentry create references, since it is not needed to log it anyways. 
+	 */
+	if (list_empty(&ei->i_fc_dilist)) {
+		mutex_unlock(&sbi->s_fc_lock);
+		return;
+	}
+
+	fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
+	WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
+	list_del_init(&fc_dentry->fcd_list);
+	list_del_init(&fc_dentry->fcd_dilist);
+
+	WARN_ON(!list_empty(&ei->i_fc_dilist));
+	mutex_unlock(&sbi->s_fc_lock);
+
+	release_dentry_name_snapshot(&fc_dentry->fcd_name);
+	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
+}
+
+/*
+ * Mark the file system as fast commit ineligible, and record the latest
+ * ineligible transaction tid. This means that, until the recorded transaction,
+ * a commit operation will result in a full jbd2 commit.
+ */
+void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	tid_t tid;
+	bool has_transaction = true;
+	bool is_ineligible;
+
+	if (ext4_fc_disabled(sb))
+		return;
+
+	if (handle && !IS_ERR(handle))
+		tid = handle->h_transaction->t_tid;
+	else {
+		read_lock(&sbi->s_journal->j_state_lock);
+		if (sbi->s_journal->j_running_transaction)
+			tid = sbi->s_journal->j_running_transaction->t_tid;
+		else
+			has_transaction = false;
+		read_unlock(&sbi->s_journal->j_state_lock);
+	}
+	mutex_lock(&sbi->s_fc_lock);
+	is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+	if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
+		sbi->s_fc_ineligible_tid = tid;
+	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+	mutex_unlock(&sbi->s_fc_lock);
+	WARN_ON(reason >= EXT4_FC_REASON_MAX);
+	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
+}
+
+/*
+ * Generic fast commit tracking function. If this is the first time we are
+ * called after a full commit, we initialize fast commit fields and then call
+ * __fc_track_fn() with update = 0. If we have already been called after a full
+ * commit, we pass update = 1. Based on that, the track function can determine
+ * if it needs to track a field for the first time or if it needs to just
+ * update the previously tracked value.
+ *
+ * If enqueue is set, this function enqueues the inode in the fast commit list.
+ */
+static int ext4_fc_track_template(
+	handle_t *handle, struct inode *inode,
+	int (*__fc_track_fn)(handle_t *handle, struct inode *, void *, bool),
+	void *args, int enqueue)
+{
+	bool update = false;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	tid_t tid = 0;
+	int ret;
+
+	tid = handle->h_transaction->t_tid;
+	spin_lock(&ei->i_fc_lock);
+	if (tid == ei->i_sync_tid) {
+		update = true;
+	} else {
+		ext4_fc_reset_inode(inode);
+		ei->i_sync_tid = tid;
+	}
+	ret = __fc_track_fn(handle, inode, args, update);
+	spin_unlock(&ei->i_fc_lock);
+	if (!enqueue)
+		return ret;
+
+	mutex_lock(&sbi->s_fc_lock);
+	if (list_empty(&EXT4_I(inode)->i_fc_list))
+		list_add_tail(&EXT4_I(inode)->i_fc_list,
+				(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
+				 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
+				&sbi->s_fc_q[FC_Q_STAGING] :
+				&sbi->s_fc_q[FC_Q_MAIN]);
+	mutex_unlock(&sbi->s_fc_lock);
+
+	return ret;
+}
+
+struct __track_dentry_update_args {
+	struct dentry *dentry;
+	int op;
+};
+
+/* __track_fn for directory entry updates. Called with ei->i_fc_lock.
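+ * Note that it temporarily drops and re-takes ei->i_fc_lock around the
+ * allocation below, so the caller cannot rely on the lock being held
+ * continuously across this call.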
*/ +static int __track_dentry_update(handle_t *handle, struct inode *inode, + void *arg, bool update) +{ + struct ext4_fc_dentry_update *node; + struct ext4_inode_info *ei = EXT4_I(inode); + struct __track_dentry_update_args *dentry_update = + (struct __track_dentry_update_args *)arg; + struct dentry *dentry = dentry_update->dentry; + struct inode *dir = dentry->d_parent->d_inode; + struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + spin_unlock(&ei->i_fc_lock); + + if (IS_ENCRYPTED(dir)) { + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, + handle); + spin_lock(&ei->i_fc_lock); + return -EOPNOTSUPP; + } + + node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); + if (!node) { + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle); + spin_lock(&ei->i_fc_lock); + return -ENOMEM; + } + + node->fcd_op = dentry_update->op; + node->fcd_parent = dir->i_ino; + node->fcd_ino = inode->i_ino; + take_dentry_name_snapshot(&node->fcd_name, dentry); + INIT_LIST_HEAD(&node->fcd_dilist); + INIT_LIST_HEAD(&node->fcd_list); + mutex_lock(&sbi->s_fc_lock); + if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || + sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) + list_add_tail(&node->fcd_list, + &sbi->s_fc_dentry_q[FC_Q_STAGING]); + else + list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]); + + /* + * This helps us keep a track of all fc_dentry updates which is part of + * this ext4 inode. So in case the inode is getting unlinked, before + * even we get a chance to fsync, we could remove all fc_dentry + * references while evicting the inode in ext4_fc_del(). + * Also with this, we don't need to loop over all the inodes in + * sbi->s_fc_q to get the corresponding inode in + * ext4_fc_commit_dentry_updates(). 
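+	 *
+	 * Example (sketch): "creat A; unlink A" with no fsync in between
+	 * lets ext4_fc_del() locate and free A's pending EXT4_FC_TAG_CREAT
+	 * record straight from i_fc_dilist, without scanning sbi->s_fc_q.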
+ */ + if (dentry_update->op == EXT4_FC_TAG_CREAT) { + WARN_ON(!list_empty(&ei->i_fc_dilist)); + list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); + } + mutex_unlock(&sbi->s_fc_lock); + spin_lock(&ei->i_fc_lock); + + return 0; +} + +void __ext4_fc_track_unlink(handle_t *handle, + struct inode *inode, struct dentry *dentry) +{ + struct __track_dentry_update_args args; + int ret; + + args.dentry = dentry; + args.op = EXT4_FC_TAG_UNLINK; + + ret = ext4_fc_track_template(handle, inode, __track_dentry_update, + (void *)&args, 0); + trace_ext4_fc_track_unlink(handle, inode, dentry, ret); +} + +void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + if (ext4_fc_disabled(inode->i_sb)) + return; + + if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) + return; + + __ext4_fc_track_unlink(handle, inode, dentry); +} + +void __ext4_fc_track_link(handle_t *handle, + struct inode *inode, struct dentry *dentry) +{ + struct __track_dentry_update_args args; + int ret; + + args.dentry = dentry; + args.op = EXT4_FC_TAG_LINK; + + ret = ext4_fc_track_template(handle, inode, __track_dentry_update, + (void *)&args, 0); + trace_ext4_fc_track_link(handle, inode, dentry, ret); +} + +void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + if (ext4_fc_disabled(inode->i_sb)) + return; + + if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) + return; + + __ext4_fc_track_link(handle, inode, dentry); +} + +void __ext4_fc_track_create(handle_t *handle, struct inode *inode, + struct dentry *dentry) +{ + struct __track_dentry_update_args args; + int ret; + + args.dentry = dentry; + args.op = EXT4_FC_TAG_CREAT; + + ret = ext4_fc_track_template(handle, inode, __track_dentry_update, + (void *)&args, 0); + trace_ext4_fc_track_create(handle, inode, dentry, ret); +} + +void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + if (ext4_fc_disabled(inode->i_sb)) + return; + + if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) + return; + + __ext4_fc_track_create(handle, inode, dentry); +} + +/* __track_fn for inode tracking */ +static int __track_inode(handle_t *handle, struct inode *inode, void *arg, + bool update) +{ + if (update) + return -EEXIST; + + EXT4_I(inode)->i_fc_lblk_len = 0; + + return 0; +} + +void ext4_fc_track_inode(handle_t *handle, struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + wait_queue_head_t *wq; + int ret; + + if (S_ISDIR(inode->i_mode)) + return; + + if (ext4_fc_disabled(inode->i_sb)) + return; + + if (ext4_should_journal_data(inode)) { + ext4_fc_mark_ineligible(inode->i_sb, + EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); + return; + } + + if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) + return; + + /* + * If we come here, we may sleep while waiting for the inode to + * commit. We shouldn't be holding i_data_sem when we go to sleep since + * the commit path needs to grab the lock while committing the inode. 
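+	 *
+	 * (Deadlock sketch: the committing task needs i_data_sem to flush
+	 * this inode; if we slept in the wait loop below while holding
+	 * i_data_sem, neither task could make progress.)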
+ */ + lockdep_assert_not_held(&ei->i_data_sem); + + while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_COMMITTING); +#endif + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) + schedule(); + finish_wait(wq, &wait.wq_entry); + } + + /* + * From this point on, this inode will not be committed either + * by fast or full commit as long as the handle is open. + */ + ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); + trace_ext4_fc_track_inode(handle, inode, ret); +} + +struct __track_range_args { + ext4_lblk_t start, end; +}; + +/* __track_fn for tracking data updates */ +static int __track_range(handle_t *handle, struct inode *inode, void *arg, + bool update) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_lblk_t oldstart; + struct __track_range_args *__arg = + (struct __track_range_args *)arg; + + if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) { + ext4_debug("Special inode %ld being modified\n", inode->i_ino); + return -ECANCELED; + } + + oldstart = ei->i_fc_lblk_start; + + if (update && ei->i_fc_lblk_len > 0) { + ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start); + ei->i_fc_lblk_len = + max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) - + ei->i_fc_lblk_start + 1; + } else { + ei->i_fc_lblk_start = __arg->start; + ei->i_fc_lblk_len = __arg->end - __arg->start + 1; + } + + return 0; +} + +void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) +{ + struct __track_range_args args; + int ret; + + if (S_ISDIR(inode->i_mode)) + return; + + if (ext4_fc_disabled(inode->i_sb)) + return; + + if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) + return; + + if (ext4_has_inline_data(inode)) { + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, + handle); + return; + } + + args.start = start; + args.end = end; + + ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1); + + trace_ext4_fc_track_range(handle, inode, start, end, ret); +} + +static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) +{ + blk_opf_t write_flags = JBD2_JOURNAL_REQ_FLAGS; + struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; + + /* Add REQ_FUA | REQ_PREFLUSH only its tail */ + if (test_opt(sb, BARRIER) && is_tail) + write_flags |= REQ_FUA | REQ_PREFLUSH; + lock_buffer(bh); + set_buffer_dirty(bh); + set_buffer_uptodate(bh); + bh->b_end_io = ext4_end_buffer_io_sync; + submit_bh(REQ_OP_WRITE | write_flags, bh); + EXT4_SB(sb)->s_fc_bh = NULL; +} + +/* Ext4 commit path routines */ + +/* + * Allocate len bytes on a fast commit buffer. + * + * During the commit time this function is used to manage fast commit + * block space. We don't split a fast commit log onto different + * blocks. So this function makes sure that if there's not enough space + * on the current block, the remaining space in the current block is + * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, + * new block is from jbd2 and CRC is updated to reflect the padding + * we added. 
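+ * Example: with a 4096-byte block and off == 4000, only 92 value bytes
+ * remain after the 4-byte tlv header (4000 + 4 + 92 == 4096), so a
+ * request for, say, 200 bytes writes a PAD record over those 92 bytes
+ * and is served from the start of a fresh block instead.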
+ */ +static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) +{ + struct ext4_fc_tl tl; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *bh; + int bsize = sbi->s_journal->j_blocksize; + int ret, off = sbi->s_fc_bytes % bsize; + int remaining; + u8 *dst; + + /* + * If 'len' is too long to fit in any block alongside a PAD tlv, then we + * cannot fulfill the request. + */ + if (len > bsize - EXT4_FC_TAG_BASE_LEN) + return NULL; + + if (!sbi->s_fc_bh) { + ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); + if (ret) + return NULL; + sbi->s_fc_bh = bh; + } + dst = sbi->s_fc_bh->b_data + off; + + /* + * Allocate the bytes in the current block if we can do so while still + * leaving enough space for a PAD tlv. + */ + remaining = bsize - EXT4_FC_TAG_BASE_LEN - off; + if (len <= remaining) { + sbi->s_fc_bytes += len; + return dst; + } + + /* + * Else, terminate the current block with a PAD tlv, then allocate a new + * block and allocate the bytes at the start of that new block. + */ + + tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); + tl.fc_len = cpu_to_le16(remaining); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); + memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); + *crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize); + + ext4_fc_submit_bh(sb, false); + + ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); + if (ret) + return NULL; + sbi->s_fc_bh = bh; + sbi->s_fc_bytes += bsize - off + len; + return sbi->s_fc_bh->b_data; +} + +/* + * Complete a fast commit by writing tail tag. + * + * Writing tail tag marks the end of a fast commit. In order to guarantee + * atomicity, after writing tail tag, even if there's space remaining + * in the block, next commit shouldn't use it. That's why tail tag + * has the length as that of the remaining space on the block. + */ +static int ext4_fc_write_tail(struct super_block *sb, u32 crc) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_fc_tl tl; + struct ext4_fc_tail tail; + int off, bsize = sbi->s_journal->j_blocksize; + u8 *dst; + + /* + * ext4_fc_reserve_space takes care of allocating an extra block if + * there's no enough space on this block for accommodating this tail. + */ + dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc); + if (!dst) + return -ENOSPC; + + off = sbi->s_fc_bytes % bsize; + + tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); + tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail)); + sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); + + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); + dst += EXT4_FC_TAG_BASE_LEN; + tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); + memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); + dst += sizeof(tail.fc_tid); + crc = ext4_chksum(crc, sbi->s_fc_bh->b_data, + dst - (u8 *)sbi->s_fc_bh->b_data); + tail.fc_crc = cpu_to_le32(crc); + memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); + dst += sizeof(tail.fc_crc); + memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */ + + ext4_fc_submit_bh(sb, true); + + return 0; +} + +/* + * Adds tag, length, value and updates CRC. Returns true if tlv was added. + * Returns false if there's not enough space. 
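+ * On disk a tlv is a 2-byte little-endian tag, a 2-byte little-endian
+ * length, and then 'len' value bytes, so the full record occupies
+ * EXT4_FC_TAG_BASE_LEN + len bytes.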
+ */ +static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, + u32 *crc) +{ + struct ext4_fc_tl tl; + u8 *dst; + + dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc); + if (!dst) + return false; + + tl.fc_tag = cpu_to_le16(tag); + tl.fc_len = cpu_to_le16(len); + + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); + memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len); + + return true; +} + +/* Same as above, but adds dentry tlv. */ +static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, + struct ext4_fc_dentry_update *fc_dentry) +{ + struct ext4_fc_dentry_info fcd; + struct ext4_fc_tl tl; + int dlen = fc_dentry->fcd_name.name.len; + u8 *dst = ext4_fc_reserve_space(sb, + EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc); + + if (!dst) + return false; + + fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent); + fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); + tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); + tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); + dst += EXT4_FC_TAG_BASE_LEN; + memcpy(dst, &fcd, sizeof(fcd)); + dst += sizeof(fcd); + memcpy(dst, fc_dentry->fcd_name.name.name, dlen); + + return true; +} + +/* + * Writes inode in the fast commit space under TLV with tag @tag. + * Returns 0 on success, error on failure. + */ +static int ext4_fc_write_inode(struct inode *inode, u32 *crc) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + int inode_len = EXT4_GOOD_OLD_INODE_SIZE; + int ret; + struct ext4_iloc iloc; + struct ext4_fc_inode fc_inode; + struct ext4_fc_tl tl; + u8 *dst; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) + inode_len = EXT4_INODE_SIZE(inode->i_sb); + else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) + inode_len += ei->i_extra_isize; + + fc_inode.fc_ino = cpu_to_le32(inode->i_ino); + tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); + tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); + + ret = -ECANCELED; + dst = ext4_fc_reserve_space(inode->i_sb, + EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc); + if (!dst) + goto err; + + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); + dst += EXT4_FC_TAG_BASE_LEN; + memcpy(dst, &fc_inode, sizeof(fc_inode)); + dst += sizeof(fc_inode); + memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len); + ret = 0; +err: + brelse(iloc.bh); + return ret; +} + +/* + * Writes updated data ranges for the inode in question. Updates CRC. + * Returns 0 on success, error otherwise. 
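+ * The tracked range [i_fc_lblk_start, i_fc_lblk_start + i_fc_lblk_len)
+ * is walked with ext4_map_blocks(): mapped pieces are logged as
+ * ADD_RANGE tlvs and unmapped holes as DEL_RANGE tlvs.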
+ */ +static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) +{ + ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_map_blocks map; + struct ext4_fc_add_range fc_ext; + struct ext4_fc_del_range lrange; + struct ext4_extent *ex; + int ret; + + spin_lock(&ei->i_fc_lock); + if (ei->i_fc_lblk_len == 0) { + spin_unlock(&ei->i_fc_lock); + return 0; + } + old_blk_size = ei->i_fc_lblk_start; + new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; + ei->i_fc_lblk_len = 0; + spin_unlock(&ei->i_fc_lock); + + cur_lblk_off = old_blk_size; + ext4_debug("will try writing %d to %d for inode %ld\n", + cur_lblk_off, new_blk_size, inode->i_ino); + + while (cur_lblk_off <= new_blk_size) { + map.m_lblk = cur_lblk_off; + map.m_len = new_blk_size - cur_lblk_off + 1; + ret = ext4_map_blocks(NULL, inode, &map, + EXT4_GET_BLOCKS_IO_SUBMIT | + EXT4_EX_NOCACHE); + if (ret < 0) + return -ECANCELED; + + if (map.m_len == 0) { + cur_lblk_off++; + continue; + } + + if (ret == 0) { + lrange.fc_ino = cpu_to_le32(inode->i_ino); + lrange.fc_lblk = cpu_to_le32(map.m_lblk); + lrange.fc_len = cpu_to_le32(map.m_len); + if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, + sizeof(lrange), (u8 *)&lrange, crc)) + return -ENOSPC; + } else { + unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ? + EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN; + + /* Limit the number of blocks in one extent */ + map.m_len = min(max, map.m_len); + + fc_ext.fc_ino = cpu_to_le32(inode->i_ino); + ex = (struct ext4_extent *)&fc_ext.fc_ex; + ex->ee_block = cpu_to_le32(map.m_lblk); + ex->ee_len = cpu_to_le16(map.m_len); + ext4_ext_store_pblock(ex, map.m_pblk); + if (map.m_flags & EXT4_MAP_UNWRITTEN) + ext4_ext_mark_unwritten(ex); + else + ext4_ext_mark_initialized(ex); + if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, + sizeof(fc_ext), (u8 *)&fc_ext, crc)) + return -ENOSPC; + } + + cur_lblk_off += map.m_len; + } + + return 0; +} + + +/* Flushes data of all the inodes in the commit queue. */ +static int ext4_fc_flush_data(journal_t *journal) +{ + struct super_block *sb = journal->j_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_inode_info *ei; + int ret = 0; + + list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ret = jbd2_submit_inode_data(journal, ei->jinode); + if (ret) + return ret; + } + + list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ret = jbd2_wait_inode_data(journal, ei->jinode); + if (ret) + return ret; + } + + return 0; +} + +/* Commit all the directory entry updates */ +static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) +{ + struct super_block *sb = journal->j_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; + struct inode *inode; + struct ext4_inode_info *ei; + int ret; + + if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) + return 0; + list_for_each_entry_safe(fc_dentry, fc_dentry_n, + &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { + if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) + return -ENOSPC; + continue; + } + /* + * With fcd_dilist we need not loop in sbi->s_fc_q to get the + * corresponding inode. Also, the corresponding inode could have been + * deleted, in which case, we don't need to do anything. 
+ */ + if (list_empty(&fc_dentry->fcd_dilist)) + continue; + ei = list_first_entry(&fc_dentry->fcd_dilist, + struct ext4_inode_info, i_fc_dilist); + inode = &ei->vfs_inode; + WARN_ON(inode->i_ino != fc_dentry->fcd_ino); + + /* + * We first write the inode and then the create dirent. This + * allows the recovery code to create an unnamed inode first + * and then link it to a directory entry. This allows us + * to use namei.c routines almost as is and simplifies + * the recovery code. + */ + ret = ext4_fc_write_inode(inode, crc); + if (ret) + return ret; + ret = ext4_fc_write_inode_data(inode, crc); + if (ret) + return ret; + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) + return -ENOSPC; + } + return 0; +} + +static int ext4_fc_perform_commit(journal_t *journal) +{ + struct super_block *sb = journal->j_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_inode_info *iter; + struct ext4_fc_head head; + struct inode *inode; + struct blk_plug plug; + int ret = 0; + u32 crc = 0; + + /* + * Step 1: Mark all inodes on s_fc_q[MAIN] with + * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being + * freed until the data flush is over. + */ + mutex_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_set_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_FLUSHING_DATA); + } + mutex_unlock(&sbi->s_fc_lock); + + /* Step 2: Flush data for all the eligible inodes. */ + ret = ext4_fc_flush_data(journal); + + /* + * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning + * any error from step 2. This ensures that waiters waiting on + * EXT4_STATE_FC_FLUSHING_DATA can resume. + */ + mutex_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_clear_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_FLUSHING_DATA); +#if (BITS_PER_LONG < 64) + wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA); +#else + wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA); +#endif + } + + /* + * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before + * the waiter checks the bit. Pairs with implicit barrier in + * prepare_to_wait() in ext4_fc_del(). + */ + smp_mb(); + mutex_unlock(&sbi->s_fc_lock); + + /* + * If we encountered error in Step 2, return it now after clearing + * EXT4_STATE_FC_FLUSHING_DATA bit. + */ + if (ret) + return ret; + + + /* Step 4: Mark all inodes as being committed. */ + jbd2_journal_lock_updates(journal); + /* + * The journal is now locked. No more handles can start and all the + * previous handles are now drained. We now mark the inodes on the + * commit queue as being committed. + */ + mutex_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_set_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_COMMITTING); + } + mutex_unlock(&sbi->s_fc_lock); + jbd2_journal_unlock_updates(journal); + + /* + * Step 5: If file system device is different from journal device, + * issue a cache flush before we start writing fast commit blocks. + */ + if (journal->j_fs_dev != journal->j_dev) + blkdev_issue_flush(journal->j_fs_dev); + + blk_start_plug(&plug); + /* Step 6: Write fast commit blocks to disk. */ + if (sbi->s_fc_bytes == 0) { + /* + * Step 6.1: Add a head tag only if this is the first fast + * commit in this TID. 
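+ * (ext4_fc_cleanup() resets s_fc_bytes to zero after a full commit, so
+ * a zero byte count here means no fast commit block, and hence no HEAD
+ * tag, has been written for the running transaction yet.)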
+ */ + head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); + head.fc_tid = cpu_to_le32( + sbi->s_journal->j_running_transaction->t_tid); + if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), + (u8 *)&head, &crc)) { + ret = -ENOSPC; + goto out; + } + } + + /* Step 6.2: Now write all the dentry updates. */ + mutex_lock(&sbi->s_fc_lock); + ret = ext4_fc_commit_dentry_updates(journal, &crc); + if (ret) + goto out; + + /* Step 6.3: Now write all the changed inodes to disk. */ + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + inode = &iter->vfs_inode; + if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) + continue; + + ret = ext4_fc_write_inode_data(inode, &crc); + if (ret) + goto out; + ret = ext4_fc_write_inode(inode, &crc); + if (ret) + goto out; + } + /* Step 6.4: Finally write tail tag to conclude this fast commit. */ + ret = ext4_fc_write_tail(sb, crc); + +out: + mutex_unlock(&sbi->s_fc_lock); + blk_finish_plug(&plug); + return ret; +} + +static void ext4_fc_update_stats(struct super_block *sb, int status, + u64 commit_time, int nblks, tid_t commit_tid) +{ + struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; + + ext4_debug("Fast commit ended with status = %d for tid %u", + status, commit_tid); + if (status == EXT4_FC_STATUS_OK) { + stats->fc_num_commits++; + stats->fc_numblks += nblks; + if (likely(stats->s_fc_avg_commit_time)) + stats->s_fc_avg_commit_time = + (commit_time + + stats->s_fc_avg_commit_time * 3) / 4; + else + stats->s_fc_avg_commit_time = commit_time; + } else if (status == EXT4_FC_STATUS_FAILED || + status == EXT4_FC_STATUS_INELIGIBLE) { + if (status == EXT4_FC_STATUS_FAILED) + stats->fc_failed_commits++; + stats->fc_ineligible_commits++; + } else { + stats->fc_skipped_commits++; + } + trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid); +} + +/* + * The main commit entry point. Performs a fast commit for transaction + * commit_tid if needed. If it's not possible to perform a fast commit + * due to various reasons, we fall back to full commit. Returns 0 + * on success, error otherwise. + */ +int ext4_fc_commit(journal_t *journal, tid_t commit_tid) +{ + struct super_block *sb = journal->j_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int nblks = 0, ret, bsize = journal->j_blocksize; + int subtid = atomic_read(&sbi->s_fc_subtid); + int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; + ktime_t start_time, commit_time; + int old_ioprio, journal_ioprio; + + if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) + return jbd2_complete_transaction(journal, commit_tid); + + trace_ext4_fc_commit_start(sb, commit_tid); + + start_time = ktime_get(); + old_ioprio = get_current_ioprio(); + +restart_fc: + ret = jbd2_fc_begin_commit(journal, commit_tid); + if (ret == -EALREADY) { + /* There was an ongoing commit, check if we need to restart */ + if (atomic_read(&sbi->s_fc_subtid) <= subtid && + tid_gt(commit_tid, journal->j_commit_sequence)) + goto restart_fc; + ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0, + commit_tid); + return 0; + } else if (ret) { + /* + * Commit couldn't start. Just update stats and perform a + * full commit. + */ + ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0, + commit_tid); + return jbd2_complete_transaction(journal, commit_tid); + } + + /* + * After establishing journal barrier via jbd2_fc_begin_commit(), check + * if we are fast commit ineligible. 
+ */ + if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) { + status = EXT4_FC_STATUS_INELIGIBLE; + goto fallback; + } + + /* + * Now that we know that this thread is going to do a fast commit, + * elevate the priority to match that of the journal thread. + */ + if (journal->j_task->io_context) + journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; + else + journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; + set_task_ioprio(current, journal_ioprio); + fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; + ret = ext4_fc_perform_commit(journal); + if (ret < 0) { + status = EXT4_FC_STATUS_FAILED; + goto fallback; + } + nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; + ret = jbd2_fc_wait_bufs(journal, nblks); + if (ret < 0) { + status = EXT4_FC_STATUS_FAILED; + goto fallback; + } + atomic_inc(&sbi->s_fc_subtid); + ret = jbd2_fc_end_commit(journal); + set_task_ioprio(current, old_ioprio); + /* + * weight the commit time higher than the average time so we + * don't react too strongly to vast changes in the commit time + */ + commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid); + return ret; + +fallback: + set_task_ioprio(current, old_ioprio); + ret = jbd2_fc_end_commit_fallback(journal); + ext4_fc_update_stats(sb, status, 0, 0, commit_tid); + return ret; +} + +/* + * Fast commit cleanup routine. This is called after every fast commit and + * full commit. full is true if we are called after a full commit. + */ +static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) +{ + struct super_block *sb = journal->j_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_inode_info *ei; + struct ext4_fc_dentry_update *fc_dentry; + + if (full && sbi->s_fc_bh) + sbi->s_fc_bh = NULL; + + trace_ext4_fc_cleanup(journal, full, tid); + jbd2_fc_release_bufs(journal); + + mutex_lock(&sbi->s_fc_lock); + while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) { + ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN], + struct ext4_inode_info, + i_fc_list); + list_del_init(&ei->i_fc_list); + ext4_clear_inode_state(&ei->vfs_inode, + EXT4_STATE_FC_COMMITTING); + if (tid_geq(tid, ei->i_sync_tid)) { + ext4_fc_reset_inode(&ei->vfs_inode); + } else if (full) { + /* + * We are called after a full commit, inode has been + * modified while the commit was running. Re-enqueue + * the inode into STAGING, which will then be splice + * back into MAIN. This cannot happen during + * fastcommit because the journal is locked all the + * time in that case (and tid doesn't increase so + * tid check above isn't reliable). + */ + list_add_tail(&ei->i_fc_list, + &sbi->s_fc_q[FC_Q_STAGING]); + } + /* + * Make sure clearing of EXT4_STATE_FC_COMMITTING is + * visible before we send the wakeup. Pairs with implicit + * barrier in prepare_to_wait() in ext4_fc_track_inode(). 
+ */ + smp_mb(); +#if (BITS_PER_LONG < 64) + wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING); +#else + wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING); +#endif + } + + while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { + fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], + struct ext4_fc_dentry_update, + fcd_list); + list_del_init(&fc_dentry->fcd_list); + list_del_init(&fc_dentry->fcd_dilist); + + release_dentry_name_snapshot(&fc_dentry->fcd_name); + kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); + } + + list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], + &sbi->s_fc_dentry_q[FC_Q_MAIN]); + list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], + &sbi->s_fc_q[FC_Q_MAIN]); + + if (tid_geq(tid, sbi->s_fc_ineligible_tid)) { + sbi->s_fc_ineligible_tid = 0; + ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + } + + if (full) + sbi->s_fc_bytes = 0; + mutex_unlock(&sbi->s_fc_lock); + trace_ext4_fc_stats(sb); +} + +/* Ext4 Replay Path Routines */ + +/* Helper struct for dentry replay routines */ +struct dentry_info_args { + int parent_ino, dname_len, ino, inode_len; + char *dname; +}; + +/* Same as struct ext4_fc_tl, but uses native endianness fields */ +struct ext4_fc_tl_mem { + u16 fc_tag; + u16 fc_len; +}; + +static inline void tl_to_darg(struct dentry_info_args *darg, + struct ext4_fc_tl_mem *tl, u8 *val) +{ + struct ext4_fc_dentry_info fcd; + + memcpy(&fcd, val, sizeof(fcd)); + + darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); + darg->ino = le32_to_cpu(fcd.fc_ino); + darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname); + darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info); +} + +static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val) +{ + struct ext4_fc_tl tl_disk; + + memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN); + tl->fc_len = le16_to_cpu(tl_disk.fc_len); + tl->fc_tag = le16_to_cpu(tl_disk.fc_tag); +} + +/* Unlink replay function */ +static int ext4_fc_replay_unlink(struct super_block *sb, + struct ext4_fc_tl_mem *tl, u8 *val) +{ + struct inode *inode, *old_parent; + struct qstr entry; + struct dentry_info_args darg; + int ret = 0; + + tl_to_darg(&darg, tl, val); + + trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino, + darg.parent_ino, darg.dname_len); + + entry.name = darg.dname; + entry.len = darg.dname_len; + inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); + + if (IS_ERR(inode)) { + ext4_debug("Inode %d not found", darg.ino); + return 0; + } + + old_parent = ext4_iget(sb, darg.parent_ino, + EXT4_IGET_NORMAL); + if (IS_ERR(old_parent)) { + ext4_debug("Dir with inode %d not found", darg.parent_ino); + iput(inode); + return 0; + } + + ret = __ext4_unlink(old_parent, &entry, inode, NULL); + /* -ENOENT ok coz it might not exist anymore. 
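+ * The dirent may already be gone, e.g. if an earlier replay attempt
+ * got far enough to apply this unlink before the system crashed again.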
*/ + if (ret == -ENOENT) + ret = 0; + iput(old_parent); + iput(inode); + return ret; +} + +static int ext4_fc_replay_link_internal(struct super_block *sb, + struct dentry_info_args *darg, + struct inode *inode) +{ + struct inode *dir = NULL; + struct dentry *dentry_dir = NULL, *dentry_inode = NULL; + struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); + int ret = 0; + + dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); + if (IS_ERR(dir)) { + ext4_debug("Dir with inode %d not found.", darg->parent_ino); + dir = NULL; + goto out; + } + + dentry_dir = d_obtain_alias(dir); + if (IS_ERR(dentry_dir)) { + ext4_debug("Failed to obtain dentry"); + dentry_dir = NULL; + goto out; + } + + dentry_inode = d_alloc(dentry_dir, &qstr_dname); + if (!dentry_inode) { + ext4_debug("Inode dentry not created."); + ret = -ENOMEM; + goto out; + } + + ret = __ext4_link(dir, inode, dentry_inode); + /* + * It's possible that link already existed since data blocks + * for the dir in question got persisted before we crashed OR + * we replayed this tag and crashed before the entire replay + * could complete. + */ + if (ret && ret != -EEXIST) { + ext4_debug("Failed to link\n"); + goto out; + } + + ret = 0; +out: + if (dentry_dir) { + d_drop(dentry_dir); + dput(dentry_dir); + } else if (dir) { + iput(dir); + } + if (dentry_inode) { + d_drop(dentry_inode); + dput(dentry_inode); + } + + return ret; +} + +/* Link replay function */ +static int ext4_fc_replay_link(struct super_block *sb, + struct ext4_fc_tl_mem *tl, u8 *val) +{ + struct inode *inode; + struct dentry_info_args darg; + int ret = 0; + + tl_to_darg(&darg, tl, val); + trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino, + darg.parent_ino, darg.dname_len); + + inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); + if (IS_ERR(inode)) { + ext4_debug("Inode not found."); + return 0; + } + + ret = ext4_fc_replay_link_internal(sb, &darg, inode); + iput(inode); + return ret; +} + +/* + * Record all the modified inodes during replay. We use this later to setup + * block bitmaps correctly. 
+ */ +static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) +{ + struct ext4_fc_replay_state *state; + int i; + + state = &EXT4_SB(sb)->s_fc_replay_state; + for (i = 0; i < state->fc_modified_inodes_used; i++) + if (state->fc_modified_inodes[i] == ino) + return 0; + if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { + int *fc_modified_inodes; + + fc_modified_inodes = krealloc(state->fc_modified_inodes, + sizeof(int) * (state->fc_modified_inodes_size + + EXT4_FC_REPLAY_REALLOC_INCREMENT), + GFP_KERNEL); + if (!fc_modified_inodes) + return -ENOMEM; + state->fc_modified_inodes = fc_modified_inodes; + state->fc_modified_inodes_size += + EXT4_FC_REPLAY_REALLOC_INCREMENT; + } + state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; + return 0; +} + +/* + * Inode replay function + */ +static int ext4_fc_replay_inode(struct super_block *sb, + struct ext4_fc_tl_mem *tl, u8 *val) +{ + struct ext4_fc_inode fc_inode; + struct ext4_inode *raw_inode; + struct ext4_inode *raw_fc_inode; + struct inode *inode = NULL; + struct ext4_iloc iloc; + int inode_len, ino, ret, tag = tl->fc_tag; + struct ext4_extent_header *eh; + size_t off_gen = offsetof(struct ext4_inode, i_generation); + + memcpy(&fc_inode, val, sizeof(fc_inode)); + + ino = le32_to_cpu(fc_inode.fc_ino); + trace_ext4_fc_replay(sb, tag, ino, 0, 0); + + inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); + if (!IS_ERR(inode)) { + ext4_ext_clear_bb(inode); + iput(inode); + } + inode = NULL; + + ret = ext4_fc_record_modified_inode(sb, ino); + if (ret) + goto out; + + raw_fc_inode = (struct ext4_inode *) + (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); + ret = ext4_get_fc_inode_loc(sb, ino, &iloc); + if (ret) + goto out; + + inode_len = tl->fc_len - sizeof(struct ext4_fc_inode); + raw_inode = ext4_raw_inode(&iloc); + + memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); + memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen, + inode_len - off_gen); + if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { + eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); + if (eh->eh_magic != EXT4_EXT_MAGIC) { + memset(eh, 0, sizeof(*eh)); + eh->eh_magic = EXT4_EXT_MAGIC; + eh->eh_max = cpu_to_le16( + (sizeof(raw_inode->i_block) - + sizeof(struct ext4_extent_header)) + / sizeof(struct ext4_extent)); + } + } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { + memcpy(raw_inode->i_block, raw_fc_inode->i_block, + sizeof(raw_inode->i_block)); + } + + /* Immediately update the inode on disk. */ + ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); + if (ret) + goto out; + ret = sync_dirty_buffer(iloc.bh); + if (ret) + goto out; + ret = ext4_mark_inode_used(sb, ino); + if (ret) + goto out; + + /* Given that we just wrote the inode on disk, this SHOULD succeed. */ + inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); + if (IS_ERR(inode)) { + ext4_debug("Inode not found."); + return -EFSCORRUPTED; + } + + /* + * Our allocator could have made different decisions than before + * crashing. This should be fixed but until then, we calculate + * the number of blocks the inode. 
+ */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) + ext4_ext_replay_set_iblocks(inode); + + inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation); + ext4_reset_inode_seed(inode); + + ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); + ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); + sync_dirty_buffer(iloc.bh); + brelse(iloc.bh); +out: + iput(inode); + if (!ret) + blkdev_issue_flush(sb->s_bdev); + + return 0; +} + +/* + * Dentry create replay function. + * + * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the + * inode for which we are trying to create a dentry here, should already have + * been replayed before we start here. + */ +static int ext4_fc_replay_create(struct super_block *sb, + struct ext4_fc_tl_mem *tl, u8 *val) +{ + int ret = 0; + struct inode *inode = NULL; + struct inode *dir = NULL; + struct dentry_info_args darg; + + tl_to_darg(&darg, tl, val); + + trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino, + darg.parent_ino, darg.dname_len); + + /* This takes care of update group descriptor and other metadata */ + ret = ext4_mark_inode_used(sb, darg.ino); + if (ret) + goto out; + + inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); + if (IS_ERR(inode)) { + ext4_debug("inode %d not found.", darg.ino); + inode = NULL; + ret = -EINVAL; + goto out; + } + + if (S_ISDIR(inode->i_mode)) { + /* + * If we are creating a directory, we need to make sure that the + * dot and dot dot dirents are setup properly. + */ + dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); + if (IS_ERR(dir)) { + ext4_debug("Dir %d not found.", darg.ino); + goto out; + } + ret = ext4_init_new_dir(NULL, dir, inode); + iput(dir); + if (ret) { + ret = 0; + goto out; + } + } + ret = ext4_fc_replay_link_internal(sb, &darg, inode); + if (ret) + goto out; + set_nlink(inode, 1); + ext4_mark_inode_dirty(NULL, inode); +out: + iput(inode); + return ret; +} + +/* + * Record physical disk regions which are in use as per fast commit area, + * and used by inodes during replay phase. Our simple replay phase + * allocator excludes these regions from allocation. + */ +int ext4_fc_record_regions(struct super_block *sb, int ino, + ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay) +{ + struct ext4_fc_replay_state *state; + struct ext4_fc_alloc_region *region; + + state = &EXT4_SB(sb)->s_fc_replay_state; + /* + * during replay phase, the fc_regions_valid may not same as + * fc_regions_used, update it when do new additions. 
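+ * (fc_regions_valid counts the entries that were covered by a verified
+ * CRC during the scan phase; entries recorded after the last valid tail
+ * are stale and get overwritten here.)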
+ */ + if (replay && state->fc_regions_used != state->fc_regions_valid) + state->fc_regions_used = state->fc_regions_valid; + if (state->fc_regions_used == state->fc_regions_size) { + struct ext4_fc_alloc_region *fc_regions; + + fc_regions = krealloc(state->fc_regions, + sizeof(struct ext4_fc_alloc_region) * + (state->fc_regions_size + + EXT4_FC_REPLAY_REALLOC_INCREMENT), + GFP_KERNEL); + if (!fc_regions) + return -ENOMEM; + state->fc_regions_size += + EXT4_FC_REPLAY_REALLOC_INCREMENT; + state->fc_regions = fc_regions; + } + region = &state->fc_regions[state->fc_regions_used++]; + region->ino = ino; + region->lblk = lblk; + region->pblk = pblk; + region->len = len; + + if (replay) + state->fc_regions_valid++; + + return 0; +} + +/* Replay add range tag */ +static int ext4_fc_replay_add_range(struct super_block *sb, + struct ext4_fc_tl_mem *tl, u8 *val) +{ + struct ext4_fc_add_range fc_add_ex; + struct ext4_extent newex, *ex; + struct inode *inode; + ext4_lblk_t start, cur; + int remaining, len; + ext4_fsblk_t start_pblk; + struct ext4_map_blocks map; + struct ext4_ext_path *path = NULL; + int ret; + + memcpy(&fc_add_ex, val, sizeof(fc_add_ex)); + ex = (struct ext4_extent *)&fc_add_ex.fc_ex; + + trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, + le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex)); + + inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL); + if (IS_ERR(inode)) { + ext4_debug("Inode not found."); + return 0; + } + + ret = ext4_fc_record_modified_inode(sb, inode->i_ino); + if (ret) + goto out; + + start = le32_to_cpu(ex->ee_block); + start_pblk = ext4_ext_pblock(ex); + len = ext4_ext_get_actual_len(ex); + + cur = start; + remaining = len; + ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", + start, start_pblk, len, ext4_ext_is_unwritten(ex), + inode->i_ino); + + while (remaining > 0) { + map.m_lblk = cur; + map.m_len = remaining; + map.m_pblk = 0; + ret = ext4_map_blocks(NULL, inode, &map, 0); + + if (ret < 0) + goto out; + + if (ret == 0) { + /* Range is not mapped */ + path = ext4_find_extent(inode, cur, path, 0); + if (IS_ERR(path)) + goto out; + memset(&newex, 0, sizeof(newex)); + newex.ee_block = cpu_to_le32(cur); + ext4_ext_store_pblock( + &newex, start_pblk + cur - start); + newex.ee_len = cpu_to_le16(map.m_len); + if (ext4_ext_is_unwritten(ex)) + ext4_ext_mark_unwritten(&newex); + down_write(&EXT4_I(inode)->i_data_sem); + path = ext4_ext_insert_extent(NULL, inode, + path, &newex, 0); + up_write((&EXT4_I(inode)->i_data_sem)); + if (IS_ERR(path)) + goto out; + goto next; + } + + if (start_pblk + cur - start != map.m_pblk) { + /* + * Logical to physical mapping changed. This can happen + * if this range was removed and then reallocated to + * map to new physical blocks during a fast commit. + */ + ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, + ext4_ext_is_unwritten(ex), + start_pblk + cur - start); + if (ret) + goto out; + /* + * Mark the old blocks as free since they aren't used + * anymore. We maintain an array of all the modified + * inodes. In case these blocks are still used at either + * a different logical range in the same inode or in + * some different inode, we will mark them as allocated + * at the end of the FC replay using our array of + * modified inodes. 
+ */ + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); + goto next; + } + + /* Range is mapped and needs a state change */ + ext4_debug("Converting from %ld to %d %lld", + map.m_flags & EXT4_MAP_UNWRITTEN, + ext4_ext_is_unwritten(ex), map.m_pblk); + ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, + ext4_ext_is_unwritten(ex), map.m_pblk); + if (ret) + goto out; + /* + * We may have split the extent tree while toggling the state. + * Try to shrink the extent tree now. + */ + ext4_ext_replay_shrink_inode(inode, start + len); +next: + cur += map.m_len; + remaining -= map.m_len; + } + ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> + sb->s_blocksize_bits); +out: + ext4_free_ext_path(path); + iput(inode); + return 0; +} + +/* Replay DEL_RANGE tag */ +static int +ext4_fc_replay_del_range(struct super_block *sb, + struct ext4_fc_tl_mem *tl, u8 *val) +{ + struct inode *inode; + struct ext4_fc_del_range lrange; + struct ext4_map_blocks map; + ext4_lblk_t cur, remaining; + int ret; + + memcpy(&lrange, val, sizeof(lrange)); + cur = le32_to_cpu(lrange.fc_lblk); + remaining = le32_to_cpu(lrange.fc_len); + + trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE, + le32_to_cpu(lrange.fc_ino), cur, remaining); + + inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL); + if (IS_ERR(inode)) { + ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino)); + return 0; + } + + ret = ext4_fc_record_modified_inode(sb, inode->i_ino); + if (ret) + goto out; + + ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n", + inode->i_ino, le32_to_cpu(lrange.fc_lblk), + le32_to_cpu(lrange.fc_len)); + while (remaining > 0) { + map.m_lblk = cur; + map.m_len = remaining; + + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + goto out; + if (ret > 0) { + remaining -= ret; + cur += ret; + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); + } else { + remaining -= map.m_len; + cur += map.m_len; + } + } + + down_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk), + le32_to_cpu(lrange.fc_lblk) + + le32_to_cpu(lrange.fc_len) - 1); + up_write(&EXT4_I(inode)->i_data_sem); + if (ret) + goto out; + ext4_ext_replay_shrink_inode(inode, + i_size_read(inode) >> sb->s_blocksize_bits); + ext4_mark_inode_dirty(NULL, inode); +out: + iput(inode); + return 0; +} + +static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) +{ + struct ext4_fc_replay_state *state; + struct inode *inode; + struct ext4_ext_path *path = NULL; + struct ext4_map_blocks map; + int i, ret, j; + ext4_lblk_t cur, end; + + state = &EXT4_SB(sb)->s_fc_replay_state; + for (i = 0; i < state->fc_modified_inodes_used; i++) { + inode = ext4_iget(sb, state->fc_modified_inodes[i], + EXT4_IGET_NORMAL); + if (IS_ERR(inode)) { + ext4_debug("Inode %d not found.", + state->fc_modified_inodes[i]); + continue; + } + cur = 0; + end = EXT_MAX_BLOCKS; + if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) { + iput(inode); + continue; + } + while (cur < end) { + map.m_lblk = cur; + map.m_len = end - cur; + + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + break; + + if (ret > 0) { + path = ext4_find_extent(inode, map.m_lblk, path, 0); + if (!IS_ERR(path)) { + for (j = 0; j < path->p_depth; j++) + ext4_mb_mark_bb(inode->i_sb, + path[j].p_block, 1, true); + } else { + path = NULL; + } + cur += ret; + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, + map.m_len, true); + } else { + cur = cur + (map.m_len ? 
map.m_len : 1); + } + } + iput(inode); + } + + ext4_free_ext_path(path); +} + +/* + * Check if block is in excluded regions for block allocation. The simple + * allocator that runs during replay phase is calls this function to see + * if it is okay to use a block. + */ +bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk) +{ + int i; + struct ext4_fc_replay_state *state; + + state = &EXT4_SB(sb)->s_fc_replay_state; + for (i = 0; i < state->fc_regions_valid; i++) { + if (state->fc_regions[i].ino == 0 || + state->fc_regions[i].len == 0) + continue; + if (in_range(blk, state->fc_regions[i].pblk, + state->fc_regions[i].len)) + return true; + } + return false; +} + +/* Cleanup function called after replay */ +void ext4_fc_replay_cleanup(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + sbi->s_mount_state &= ~EXT4_FC_REPLAY; + kfree(sbi->s_fc_replay_state.fc_regions); + kfree(sbi->s_fc_replay_state.fc_modified_inodes); +} + +static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi, + int tag, int len) +{ + switch (tag) { + case EXT4_FC_TAG_ADD_RANGE: + return len == sizeof(struct ext4_fc_add_range); + case EXT4_FC_TAG_DEL_RANGE: + return len == sizeof(struct ext4_fc_del_range); + case EXT4_FC_TAG_CREAT: + case EXT4_FC_TAG_LINK: + case EXT4_FC_TAG_UNLINK: + len -= sizeof(struct ext4_fc_dentry_info); + return len >= 1 && len <= EXT4_NAME_LEN; + case EXT4_FC_TAG_INODE: + len -= sizeof(struct ext4_fc_inode); + return len >= EXT4_GOOD_OLD_INODE_SIZE && + len <= sbi->s_inode_size; + case EXT4_FC_TAG_PAD: + return true; /* padding can have any length */ + case EXT4_FC_TAG_TAIL: + return len >= sizeof(struct ext4_fc_tail); + case EXT4_FC_TAG_HEAD: + return len == sizeof(struct ext4_fc_head); + } + return false; +} + +/* + * Recovery Scan phase handler + * + * This function is called during the scan phase and is responsible + * for doing following things: + * - Make sure the fast commit area has valid tags for replay + * - Count number of tags that need to be replayed by the replay handler + * - Verify CRC + * - Create a list of excluded blocks for allocation during replay phase + * + * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is + * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP + * to indicate that scan has finished and JBD2 can now start replay phase. + * It returns a negative error to indicate that there was an error. At the end + * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set + * to indicate the number of tags that need to replayed during the replay phase. 
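+ * The running CRC is accumulated over every tlv as it is scanned; the
+ * TAIL tlv carries the expected value, computed over everything up to,
+ * but not including, its own fc_crc field.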
+ */ +static int ext4_fc_replay_scan(journal_t *journal, + struct buffer_head *bh, int off, + tid_t expected_tid) +{ + struct super_block *sb = journal->j_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_fc_replay_state *state; + int ret = JBD2_FC_REPLAY_CONTINUE; + struct ext4_fc_add_range ext; + struct ext4_fc_tl_mem tl; + struct ext4_fc_tail tail; + __u8 *start, *end, *cur, *val; + struct ext4_fc_head head; + struct ext4_extent *ex; + + state = &sbi->s_fc_replay_state; + + start = (u8 *)bh->b_data; + end = start + journal->j_blocksize; + + if (state->fc_replay_expected_off == 0) { + state->fc_cur_tag = 0; + state->fc_replay_num_tags = 0; + state->fc_crc = 0; + state->fc_regions = NULL; + state->fc_regions_valid = state->fc_regions_used = + state->fc_regions_size = 0; + /* Check if we can stop early */ + if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag) + != EXT4_FC_TAG_HEAD) + return 0; + } + + if (off != state->fc_replay_expected_off) { + ret = -EFSCORRUPTED; + goto out_err; + } + + state->fc_replay_expected_off++; + for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; + cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { + ext4_fc_get_tl(&tl, cur); + val = cur + EXT4_FC_TAG_BASE_LEN; + if (tl.fc_len > end - val || + !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) { + ret = state->fc_replay_num_tags ? + JBD2_FC_REPLAY_STOP : -ECANCELED; + goto out_err; + } + ext4_debug("Scan phase, tag:%s, blk %lld\n", + tag2str(tl.fc_tag), bh->b_blocknr); + switch (tl.fc_tag) { + case EXT4_FC_TAG_ADD_RANGE: + memcpy(&ext, val, sizeof(ext)); + ex = (struct ext4_extent *)&ext.fc_ex; + ret = ext4_fc_record_regions(sb, + le32_to_cpu(ext.fc_ino), + le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), + ext4_ext_get_actual_len(ex), 0); + if (ret < 0) + break; + ret = JBD2_FC_REPLAY_CONTINUE; + fallthrough; + case EXT4_FC_TAG_DEL_RANGE: + case EXT4_FC_TAG_LINK: + case EXT4_FC_TAG_UNLINK: + case EXT4_FC_TAG_CREAT: + case EXT4_FC_TAG_INODE: + case EXT4_FC_TAG_PAD: + state->fc_cur_tag++; + state->fc_crc = ext4_chksum(state->fc_crc, cur, + EXT4_FC_TAG_BASE_LEN + tl.fc_len); + break; + case EXT4_FC_TAG_TAIL: + state->fc_cur_tag++; + memcpy(&tail, val, sizeof(tail)); + state->fc_crc = ext4_chksum(state->fc_crc, cur, + EXT4_FC_TAG_BASE_LEN + + offsetof(struct ext4_fc_tail, + fc_crc)); + if (le32_to_cpu(tail.fc_tid) == expected_tid && + le32_to_cpu(tail.fc_crc) == state->fc_crc) { + state->fc_replay_num_tags = state->fc_cur_tag; + state->fc_regions_valid = + state->fc_regions_used; + } else { + ret = state->fc_replay_num_tags ? + JBD2_FC_REPLAY_STOP : -EFSBADCRC; + } + state->fc_crc = 0; + break; + case EXT4_FC_TAG_HEAD: + memcpy(&head, val, sizeof(head)); + if (le32_to_cpu(head.fc_features) & + ~EXT4_FC_SUPPORTED_FEATURES) { + ret = -EOPNOTSUPP; + break; + } + if (le32_to_cpu(head.fc_tid) != expected_tid) { + ret = JBD2_FC_REPLAY_STOP; + break; + } + state->fc_cur_tag++; + state->fc_crc = ext4_chksum(state->fc_crc, cur, + EXT4_FC_TAG_BASE_LEN + tl.fc_len); + break; + default: + ret = state->fc_replay_num_tags ? + JBD2_FC_REPLAY_STOP : -ECANCELED; + } + if (ret < 0 || ret == JBD2_FC_REPLAY_STOP) + break; + } + +out_err: + trace_ext4_fc_replay_scan(sb, ret, off); + return ret; +} + +/* + * Main recovery path entry point. + * The meaning of return codes is similar as above. 
+ */ +static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, + enum passtype pass, int off, tid_t expected_tid) +{ + struct super_block *sb = journal->j_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_fc_tl_mem tl; + __u8 *start, *end, *cur, *val; + int ret = JBD2_FC_REPLAY_CONTINUE; + struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state; + struct ext4_fc_tail tail; + + if (pass == PASS_SCAN) { + state->fc_current_pass = PASS_SCAN; + return ext4_fc_replay_scan(journal, bh, off, expected_tid); + } + + if (state->fc_current_pass != pass) { + state->fc_current_pass = pass; + sbi->s_mount_state |= EXT4_FC_REPLAY; + } + if (!sbi->s_fc_replay_state.fc_replay_num_tags) { + ext4_debug("Replay stops\n"); + ext4_fc_set_bitmaps_and_counters(sb); + return 0; + } + +#ifdef CONFIG_EXT4_DEBUG + if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) { + pr_warn("Dropping fc block %d because max_replay set\n", off); + return JBD2_FC_REPLAY_STOP; + } +#endif + + start = (u8 *)bh->b_data; + end = start + journal->j_blocksize; + + for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; + cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { + ext4_fc_get_tl(&tl, cur); + val = cur + EXT4_FC_TAG_BASE_LEN; + + if (state->fc_replay_num_tags == 0) { + ret = JBD2_FC_REPLAY_STOP; + ext4_fc_set_bitmaps_and_counters(sb); + break; + } + + ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag)); + state->fc_replay_num_tags--; + switch (tl.fc_tag) { + case EXT4_FC_TAG_LINK: + ret = ext4_fc_replay_link(sb, &tl, val); + break; + case EXT4_FC_TAG_UNLINK: + ret = ext4_fc_replay_unlink(sb, &tl, val); + break; + case EXT4_FC_TAG_ADD_RANGE: + ret = ext4_fc_replay_add_range(sb, &tl, val); + break; + case EXT4_FC_TAG_CREAT: + ret = ext4_fc_replay_create(sb, &tl, val); + break; + case EXT4_FC_TAG_DEL_RANGE: + ret = ext4_fc_replay_del_range(sb, &tl, val); + break; + case EXT4_FC_TAG_INODE: + ret = ext4_fc_replay_inode(sb, &tl, val); + break; + case EXT4_FC_TAG_PAD: + trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0, + tl.fc_len, 0); + break; + case EXT4_FC_TAG_TAIL: + trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, + 0, tl.fc_len, 0); + memcpy(&tail, val, sizeof(tail)); + WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid); + break; + case EXT4_FC_TAG_HEAD: + break; + default: + trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0); + ret = -ECANCELED; + break; + } + if (ret < 0) + break; + ret = JBD2_FC_REPLAY_CONTINUE; + } + return ret; +} + +void ext4_fc_init(struct super_block *sb, journal_t *journal) +{ + /* + * We set replay callback even if fast commit disabled because we may + * could still have fast commit blocks that need to be replayed even if + * fast commit has now been turned off. 
+ */ + journal->j_fc_replay_callback = ext4_fc_replay; + if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) + return; + journal->j_fc_cleanup_callback = ext4_fc_cleanup; +} + +static const char * const fc_ineligible_reasons[] = { + [EXT4_FC_REASON_XATTR] = "Extended attributes changed", + [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename", + [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed", + [EXT4_FC_REASON_NOMEM] = "Insufficient memory", + [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot", + [EXT4_FC_REASON_RESIZE] = "Resize", + [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed", + [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op", + [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling", + [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename", +}; + +int ext4_fc_info_show(struct seq_file *seq, void *v) +{ + struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private); + struct ext4_fc_stats *stats = &sbi->s_fc_stats; + int i; + + if (v != SEQ_START_TOKEN) + return 0; + + seq_printf(seq, + "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n", + stats->fc_num_commits, stats->fc_ineligible_commits, + stats->fc_numblks, + div_u64(stats->s_fc_avg_commit_time, 1000)); + seq_puts(seq, "Ineligible reasons:\n"); + for (i = 0; i < EXT4_FC_REASON_MAX; i++) + seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i], + stats->fc_ineligible_reason_count[i]); + + return 0; +} + +int __init ext4_fc_init_dentry_cache(void) +{ + ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update, + SLAB_RECLAIM_ACCOUNT); + + if (ext4_fc_dentry_cachep == NULL) + return -ENOMEM; + + return 0; +} + +void ext4_fc_destroy_dentry_cache(void) +{ + kmem_cache_destroy(ext4_fc_dentry_cachep); +} diff --git a/fs/ext4l/fast_commit.h b/fs/ext4l/fast_commit.h new file mode 100644 index 00000000000..3bd534e4dbb --- /dev/null +++ b/fs/ext4l/fast_commit.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __FAST_COMMIT_H__ +#define __FAST_COMMIT_H__ + +/* + * Note this file is present in e2fsprogs/lib/ext2fs/fast_commit.h and + * linux/fs/ext4/fast_commit.h. These file should always be byte identical. + */ + +/* Fast commit tags */ +#define EXT4_FC_TAG_ADD_RANGE 0x0001 +#define EXT4_FC_TAG_DEL_RANGE 0x0002 +#define EXT4_FC_TAG_CREAT 0x0003 +#define EXT4_FC_TAG_LINK 0x0004 +#define EXT4_FC_TAG_UNLINK 0x0005 +#define EXT4_FC_TAG_INODE 0x0006 +#define EXT4_FC_TAG_PAD 0x0007 +#define EXT4_FC_TAG_TAIL 0x0008 +#define EXT4_FC_TAG_HEAD 0x0009 + +#define EXT4_FC_SUPPORTED_FEATURES 0x0 + +/* On disk fast commit tlv value structures */ + +/* Fast commit on disk tag length structure */ +struct ext4_fc_tl { + __le16 fc_tag; + __le16 fc_len; +}; + +/* Value structure for tag EXT4_FC_TAG_HEAD. */ +struct ext4_fc_head { + __le32 fc_features; + __le32 fc_tid; +}; + +/* Value structure for EXT4_FC_TAG_ADD_RANGE. */ +struct ext4_fc_add_range { + __le32 fc_ino; + __u8 fc_ex[12]; +}; + +/* Value structure for tag EXT4_FC_TAG_DEL_RANGE. */ +struct ext4_fc_del_range { + __le32 fc_ino; + __le32 fc_lblk; + __le32 fc_len; +}; + +/* + * This is the value structure for tags EXT4_FC_TAG_CREAT, EXT4_FC_TAG_LINK + * and EXT4_FC_TAG_UNLINK. + */ +struct ext4_fc_dentry_info { + __le32 fc_parent_ino; + __le32 fc_ino; + __u8 fc_dname[]; +}; + +/* Value structure for EXT4_FC_TAG_INODE. */ +struct ext4_fc_inode { + __le32 fc_ino; + __u8 fc_raw_inode[]; +}; + +/* Value structure for tag EXT4_FC_TAG_TAIL. 
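+ * Its length, as written by ext4_fc_write_tail(), spans the rest of the
+ * block, so the space after a tail is never reused by a later commit.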
*/ +struct ext4_fc_tail { + __le32 fc_tid; + __le32 fc_crc; +}; + +/* Tag base length */ +#define EXT4_FC_TAG_BASE_LEN (sizeof(struct ext4_fc_tl)) + +/* + * Fast commit status codes + */ +enum { + EXT4_FC_STATUS_OK = 0, + EXT4_FC_STATUS_INELIGIBLE, + EXT4_FC_STATUS_SKIPPED, + EXT4_FC_STATUS_FAILED, +}; + +/* + * Fast commit ineligiblity reasons: + */ +enum { + EXT4_FC_REASON_XATTR = 0, + EXT4_FC_REASON_CROSS_RENAME, + EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, + EXT4_FC_REASON_NOMEM, + EXT4_FC_REASON_SWAP_BOOT, + EXT4_FC_REASON_RESIZE, + EXT4_FC_REASON_RENAME_DIR, + EXT4_FC_REASON_FALLOC_RANGE, + EXT4_FC_REASON_INODE_JOURNAL_DATA, + EXT4_FC_REASON_ENCRYPTED_FILENAME, + EXT4_FC_REASON_MAX +}; + +#ifdef __KERNEL__ +/* + * In memory list of dentry updates that are performed on the file + * system used by fast commit code. + */ +struct ext4_fc_dentry_update { + int fcd_op; /* Type of update create / unlink / link */ + int fcd_parent; /* Parent inode number */ + int fcd_ino; /* Inode number */ + struct name_snapshot fcd_name; /* Dirent name */ + struct list_head fcd_list; + struct list_head fcd_dilist; +}; + +struct ext4_fc_stats { + unsigned int fc_ineligible_reason_count[EXT4_FC_REASON_MAX]; + unsigned long fc_num_commits; + unsigned long fc_ineligible_commits; + unsigned long fc_failed_commits; + unsigned long fc_skipped_commits; + unsigned long fc_numblks; + u64 s_fc_avg_commit_time; +}; + +#define EXT4_FC_REPLAY_REALLOC_INCREMENT 4 + +/* + * Physical block regions added to different inodes due to fast commit + * recovery. These are set during the SCAN phase. During the replay phase, + * our allocator excludes these from its allocation. This ensures that + * we don't accidentally allocating a block that is going to be used by + * another inode. + */ +struct ext4_fc_alloc_region { + ext4_lblk_t lblk; + ext4_fsblk_t pblk; + int ino, len; +}; + +/* + * Fast commit replay state. + */ +struct ext4_fc_replay_state { + int fc_replay_num_tags; + int fc_replay_expected_off; + int fc_current_pass; + int fc_cur_tag; + int fc_crc; + struct ext4_fc_alloc_region *fc_regions; + int fc_regions_size, fc_regions_used, fc_regions_valid; + int *fc_modified_inodes; + int fc_modified_inodes_used, fc_modified_inodes_size; +}; + +#define region_last(__region) (((__region)->lblk) + ((__region)->len) - 1) +#endif + +static inline const char *tag2str(__u16 tag) +{ + switch (tag) { + case EXT4_FC_TAG_LINK: + return "ADD_ENTRY"; + case EXT4_FC_TAG_UNLINK: + return "DEL_ENTRY"; + case EXT4_FC_TAG_ADD_RANGE: + return "ADD_RANGE"; + case EXT4_FC_TAG_CREAT: + return "CREAT_DENTRY"; + case EXT4_FC_TAG_DEL_RANGE: + return "DEL_RANGE"; + case EXT4_FC_TAG_INODE: + return "INODE"; + case EXT4_FC_TAG_PAD: + return "PAD"; + case EXT4_FC_TAG_TAIL: + return "TAIL"; + case EXT4_FC_TAG_HEAD: + return "HEAD"; + default: + return "ERROR"; + } +} + +#endif /* __FAST_COMMIT_H__ */ -- 2.43.0
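For reference, the fast commit area is simply a sequence of tlv records laid out with the structures above. The following is a minimal sketch, not part of any patch, of walking the records in a single fast-commit block in the same way ext4_fc_replay_scan() does; struct fc_tl mirrors struct ext4_fc_tl, the other names are illustrative, and le16toh() is the glibc byte-order helper:

#include <endian.h>	/* le16toh(); glibc-specific */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Mirrors struct ext4_fc_tl: 2-byte tag, 2-byte length, little-endian */
struct fc_tl {
	uint16_t fc_tag;
	uint16_t fc_len;
};

#define FC_TAG_TAIL	0x0008	/* EXT4_FC_TAG_TAIL */

/* Walk the tlv records in one fast-commit block of bsize bytes at buf */
static void fc_walk_block(const uint8_t *buf, size_t bsize)
{
	const uint8_t *cur = buf, *end = buf + bsize;

	while ((size_t)(end - cur) >= sizeof(struct fc_tl)) {
		struct fc_tl tl;
		uint16_t tag, len;

		memcpy(&tl, cur, sizeof(tl));	/* records may be unaligned */
		tag = le16toh(tl.fc_tag);
		len = le16toh(tl.fc_len);
		if (len > (size_t)(end - cur) - sizeof(tl))
			break;	/* corrupt length; give up on this block */
		printf("tag 0x%04x, len %u\n", (unsigned)tag, (unsigned)len);
		if (tag == FC_TAG_TAIL)
			break;	/* tail's length covers the rest of the block */
		cur += sizeof(tl) + len;
	}
}

A tail record terminates the block: its length is written to span whatever space remains, which is why the walker stops there rather than skipping ahead.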
From: Simon Glass <simon.glass@canonical.com> Copy file.c, fsmap.c, fsmap.h, and ialloc.c from Linux v6.18 fs/ext4 directory. - file: file operations (open, read, write, mmap, etc.) - fsmap: filesystem block mapping for GETFSMAP ioctl - ialloc: inode allocation Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> --- fs/ext4l/file.c | 995 ++++++++++++++++++++++++++++ fs/ext4l/fsmap.c | 792 ++++++++++++++++++++++ fs/ext4l/fsmap.h | 56 ++ fs/ext4l/ialloc.c | 1621 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 3464 insertions(+) create mode 100644 fs/ext4l/file.c create mode 100644 fs/ext4l/fsmap.c create mode 100644 fs/ext4l/fsmap.h create mode 100644 fs/ext4l/ialloc.c diff --git a/fs/ext4l/file.c b/fs/ext4l/file.c new file mode 100644 index 00000000000..7a8b3093218 --- /dev/null +++ b/fs/ext4l/file.c @@ -0,0 +1,995 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4 fs regular file handling primitives + * + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/iomap.h> +#include <linux/mount.h> +#include <linux/path.h> +#include <linux/dax.h> +#include <linux/quotaops.h> +#include <linux/pagevec.h> +#include <linux/uio.h> +#include <linux/mman.h> +#include <linux/backing-dev.h> +#include "ext4.h" +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" +#include "truncate.h" + +/* + * Returns %true if the given DIO request should be attempted with DIO, or + * %false if it should fall back to buffered I/O. + * + * DIO isn't well specified; when it's unsupported (either due to the request + * being misaligned, or due to the file not supporting DIO at all), filesystems + * either fall back to buffered I/O or return EINVAL. For files that don't use + * any special features like encryption or verity, ext4 has traditionally + * returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too. + * In this case, we should attempt the DIO, *not* fall back to buffered I/O. + * + * In contrast, in cases where DIO is unsupported due to ext4 features, ext4 + * traditionally falls back to buffered I/O. + * + * This function implements the traditional ext4 behavior in all these cases. + */ +static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + u32 dio_align = ext4_dio_alignment(inode); + + if (dio_align == 0) + return false; + + if (dio_align == 1) + return true; + + return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align); +} + +static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + ssize_t ret; + struct inode *inode = file_inode(iocb->ki_filp); + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + inode_lock_shared(inode); + } + + if (!ext4_should_use_dio(iocb, to)) { + inode_unlock_shared(inode); + /* + * Fallback to buffered I/O if the operation being performed on + * the inode is not supported by direct I/O. The IOCB_DIRECT + * flag needs to be cleared here in order to ensure that the + * direct I/O path within generic_file_read_iter() is not + * taken. 
+ */ + iocb->ki_flags &= ~IOCB_DIRECT; + return generic_file_read_iter(iocb, to); + } + + ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0); + inode_unlock_shared(inode); + + file_accessed(iocb->ki_filp); + return ret; +} + +#ifdef CONFIG_FS_DAX +static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + inode_lock_shared(inode); + } + /* + * Recheck under inode lock - at this point we are sure it cannot + * change anymore + */ + if (!IS_DAX(inode)) { + inode_unlock_shared(inode); + /* Fallback to buffered IO in case we cannot support DAX */ + return generic_file_read_iter(iocb, to); + } + ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops); + inode_unlock_shared(inode); + + file_accessed(iocb->ki_filp); + return ret; +} +#endif + +static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + if (unlikely(ext4_forced_shutdown(inode->i_sb))) + return -EIO; + + if (!iov_iter_count(to)) + return 0; /* skip atime */ + +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + return ext4_dax_read_iter(iocb, to); +#endif + if (iocb->ki_flags & IOCB_DIRECT) + return ext4_dio_read_iter(iocb, to); + + return generic_file_read_iter(iocb, to); +} + +static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct inode *inode = file_inode(in); + + if (unlikely(ext4_forced_shutdown(inode->i_sb))) + return -EIO; + return filemap_splice_read(in, ppos, pipe, len, flags); +} + +/* + * Called when an inode is released. Note that this is different + * from ext4_file_open: open gets called at every open, but release + * gets called only when /all/ the files are closed. + */ +static int ext4_release_file(struct inode *inode, struct file *filp) +{ + if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { + ext4_alloc_da_blocks(inode); + ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + } + /* if we are the last writer on the inode, drop the block reservation */ + if ((filp->f_mode & FMODE_WRITE) && + (atomic_read(&inode->i_writecount) == 1) && + !EXT4_I(inode)->i_reserved_data_blocks) { + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + } + if (is_dx(inode) && filp->private_data) + ext4_htree_free_dir_info(filp->private_data); + + return 0; +} + +/* + * This tests whether the IO in question is block-aligned or not. + * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they + * are converted to written only after the IO is complete. Until they are + * mapped, these blocks appear as holes, so dio_zero_block() will assume that + * it needs to zero out portions of the start and/or end block. If 2 AIO + * threads are at work on the same unwritten block, they must be synchronized + * or one thread will zero the other's data, causing corruption. 
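+ * + * For example, with a 4096-byte block size (blockmask 0xfff), a 512-byte + * write at offset 512 is unaligned, while a 4096-byte write at offset 8192 + * from a block-aligned buffer is aligned and needs no such serialization.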
+ */ +static bool +ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos) +{ + struct super_block *sb = inode->i_sb; + unsigned long blockmask = sb->s_blocksize - 1; + + if ((pos | iov_iter_alignment(from)) & blockmask) + return true; + + return false; +} + +static bool +ext4_extending_io(struct inode *inode, loff_t offset, size_t len) +{ + if (offset + len > i_size_read(inode) || + offset + len > EXT4_I(inode)->i_disksize) + return true; + return false; +} + +/* Is IO overwriting allocated or initialized blocks? */ +static bool ext4_overwrite_io(struct inode *inode, + loff_t pos, loff_t len, bool *unwritten) +{ + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + int err, blklen; + + if (pos + len > i_size_read(inode)) + return false; + + map.m_lblk = pos >> blkbits; + map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits); + blklen = map.m_len; + + err = ext4_map_blocks(NULL, inode, &map, 0); + if (err != blklen) + return false; + /* + * 'err==len' means that all of the blocks have been preallocated, + * regardless of whether they have been initialized or not. We need to + * check m_flags to distinguish the unwritten extents. + */ + *unwritten = !(map.m_flags & EXT4_MAP_MAPPED); + return true; +} + +static ssize_t ext4_generic_write_checks(struct kiocb *iocb, + struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + return ret; + + /* + * If we have encountered a bitmap-format file, the size limit + * is smaller than s_maxbytes, which is for extent-mapped files. + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) + return -EFBIG; + iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); + } + + return iov_iter_count(from); +} + +static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + ssize_t ret, count; + + count = ext4_generic_write_checks(iocb, from); + if (count <= 0) + return count; + + ret = file_modified(iocb->ki_filp); + if (ret) + return ret; + return count; +} + +static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, + struct iov_iter *from) +{ + ssize_t ret; + struct inode *inode = file_inode(iocb->ki_filp); + + if (iocb->ki_flags & IOCB_NOWAIT) + return -EOPNOTSUPP; + + inode_lock(inode); + ret = ext4_write_checks(iocb, from); + if (ret <= 0) + goto out; + + ret = generic_perform_write(iocb, from); + +out: + inode_unlock(inode); + if (unlikely(ret <= 0)) + return ret; + return generic_write_sync(iocb, ret); +} + +static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, + ssize_t written, ssize_t count) +{ + handle_t *handle; + + lockdep_assert_held_write(&inode->i_rwsem); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (ext4_update_inode_size(inode, offset + written)) { + int ret = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret)) { + ext4_journal_stop(handle); + return ret; + } + } + + if ((written == count) && inode->i_nlink) + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + + return written; +} + +/* + * Clean up the inode after DIO or DAX extending write has completed and the + * inode size has been updated using ext4_handle_inode_extension(). 
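+ * + * For example, if an extending DIO write failed partway through, need_trunc + * is true and any blocks instantiated beyond i_size are truncated away; on + * success only stale orphan list state is cleaned up.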
+ */ +static void ext4_inode_extension_cleanup(struct inode *inode, bool need_trunc) +{ + lockdep_assert_held_write(&inode->i_rwsem); + if (need_trunc) { + ext4_truncate_failed_write(inode); + /* + * If the truncate operation failed early, then the inode may + * still be on the orphan list. In that case, we need to try to + * remove the inode from the in-memory linked list. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + return; + } + /* + * If i_disksize got extended either due to writeback of delalloc + * blocks or an extending truncate while the DIO was running, we could + * fail to clean up the orphan list in ext4_handle_inode_extension(). + * Do it now. + */ + if (ext4_inode_orphan_tracked(inode) && inode->i_nlink) { + handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + + if (IS_ERR(handle)) { + /* + * The write has successfully completed. Not much to + * do with the error here, so just clean up the orphan + * list and hope for the best. + */ + ext4_orphan_del(NULL, inode); + return; + } + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + } +} + +static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned int flags) +{ + loff_t pos = iocb->ki_pos; + struct inode *inode = file_inode(iocb->ki_filp); + + + if (!error && size && (flags & IOMAP_DIO_UNWRITTEN) && + (iocb->ki_flags & IOCB_ATOMIC)) + error = ext4_convert_unwritten_extents_atomic(NULL, inode, pos, + size); + else if (!error && size && flags & IOMAP_DIO_UNWRITTEN) + error = ext4_convert_unwritten_extents(NULL, inode, pos, size); + if (error) + return error; + /* + * Note that EXT4_I(inode)->i_disksize can get extended up to + * inode->i_size while the I/O was running due to writeback of delalloc + * blocks. But the code in ext4_iomap_alloc() is careful to use + * zeroed/unwritten extents if this is possible; thus we won't leave + * uninitialized blocks in a file even if we didn't succeed in writing + * as much as we intended. Also we can race with truncate or write + * expanding the file so we have to be a bit careful here. + */ + if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) && + pos + size <= i_size_read(inode)) + return 0; + error = ext4_handle_inode_extension(inode, pos, size, size); + return error < 0 ? error : 0; +} + +static const struct iomap_dio_ops ext4_dio_write_ops = { + .end_io = ext4_dio_write_end_io, +}; + +/* + * The intention here is to start with the shared lock acquired, then check + * whether any condition requires an exclusive inode lock. If so, we restart + * the whole operation by releasing the shared lock and acquiring the + * exclusive lock. + * + * - For unaligned I/O we never take the shared lock, as it may cause data + * corruption when two unaligned I/Os try to modify the same block, e.g. + * while zeroing. + * + * - For the extending-write case we don't take the shared lock, since it + * requires updating the inode's i_disksize and/or orphan handling under the + * exclusive lock. + * + * - Shared locking is mostly used for overwrites, of both initialized and + * unwritten blocks. For overwrites of unwritten blocks, extent splitting is + * protected by i_data_sem in ext4_inode_info, so the exclusive i_rwsem lock + * can also be released. + * + * - Otherwise we switch to the exclusive i_rwsem lock.
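+ * + * For example, a block-aligned overwrite of already-written blocks completes + * entirely under the shared lock, while a size-extending write always runs + * under the exclusive lock.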
+ */ +static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, + bool *ilock_shared, bool *extend, + bool *unwritten, int *dio_flags) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + loff_t offset; + size_t count; + ssize_t ret; + bool overwrite, unaligned_io; + +restart: + ret = ext4_generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + + offset = iocb->ki_pos; + count = ret; + + unaligned_io = ext4_unaligned_io(inode, from, offset); + *extend = ext4_extending_io(inode, offset, count); + overwrite = ext4_overwrite_io(inode, offset, count, unwritten); + + /* + * Determine whether we need to upgrade to an exclusive lock. This is + * required to change security info in file_modified(), for extending + * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten + * extents (as partial block zeroing may be required). + * + * Note that unaligned writes are allowed under shared lock so long as + * they are pure overwrites. Otherwise, concurrent unaligned writes risk + * data corruption due to partial block zeroing in the dio layer, and so + * the I/O must occur exclusively. + */ + if (*ilock_shared && + ((!IS_NOSEC(inode) || *extend || !overwrite || + (unaligned_io && *unwritten)))) { + if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; + } + inode_unlock_shared(inode); + *ilock_shared = false; + inode_lock(inode); + goto restart; + } + + /* + * Now that locking is settled, determine dio flags and exclusivity + * requirements. We don't use DIO_OVERWRITE_ONLY because we enforce + * behavior already. The inode lock is already held exclusive if the + * write is non-overwrite or extending, so drain all outstanding dio and + * set the force wait dio flag. + */ + if (!*ilock_shared && (unaligned_io || *extend)) { + if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; + } + if (unaligned_io && (!overwrite || *unwritten)) + inode_dio_wait(inode); + *dio_flags = IOMAP_DIO_FORCE_WAIT; + } + + ret = file_modified(file); + if (ret < 0) + goto out; + + return count; +out: + if (*ilock_shared) + inode_unlock_shared(inode); + else + inode_unlock(inode); + return ret; +} + +static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + ssize_t ret; + handle_t *handle; + struct inode *inode = file_inode(iocb->ki_filp); + loff_t offset = iocb->ki_pos; + size_t count = iov_iter_count(from); + const struct iomap_ops *iomap_ops = &ext4_iomap_ops; + bool extend = false, unwritten = false; + bool ilock_shared = true; + int dio_flags = 0; + + /* + * Quick check here without any i_rwsem lock to see if it is extending + * IO. A more reliable check is done in ext4_dio_write_checks() with + * proper locking in place. + */ + if (offset + count > i_size_read(inode)) + ilock_shared = false; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (ilock_shared) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + if (!inode_trylock(inode)) + return -EAGAIN; + } + } else { + if (ilock_shared) + inode_lock_shared(inode); + else + inode_lock(inode); + } + + /* Fallback to buffered I/O if the inode does not support direct I/O. */ + if (!ext4_should_use_dio(iocb, from)) { + if (ilock_shared) + inode_unlock_shared(inode); + else + inode_unlock(inode); + return ext4_buffered_write_iter(iocb, from); + } + + /* + * Prevent inline data from being created since we are going to allocate + * blocks for DIO. 
We know the inode does not currently have inline data + * because ext4_should_use_dio() checked for it, but we have to clear + * the state flag before the write checks because a lock cycle could + * introduce races with other writers. + */ + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + + ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend, + &unwritten, &dio_flags); + if (ret <= 0) + return ret; + + offset = iocb->ki_pos; + count = ret; + + if (extend) { + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + ret = ext4_orphan_add(handle, inode); + ext4_journal_stop(handle); + if (ret) + goto out; + } + + if (ilock_shared && !unwritten) + iomap_ops = &ext4_iomap_overwrite_ops; + ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, + dio_flags, NULL, 0); + if (ret == -ENOTBLK) + ret = 0; + if (extend) { + /* + * We always perform an extending DIO write synchronously, so by + * now the IO is completed and ext4_handle_inode_extension() + * was called. Clean up the inode in case of error or a race + * with writeback of delalloc blocks. + */ + WARN_ON_ONCE(ret == -EIOCBQUEUED); + ext4_inode_extension_cleanup(inode, ret < 0); + } + +out: + if (ilock_shared) + inode_unlock_shared(inode); + else + inode_unlock(inode); + + if (ret >= 0 && iov_iter_count(from)) { + ssize_t err; + loff_t endbyte; + + /* + * There is no support for atomic writes on buffered I/O yet; + * we should never fall back to buffered I/O for DIO atomic + * writes. + */ + WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC); + + offset = iocb->ki_pos; + err = ext4_buffered_write_iter(iocb, from); + if (err < 0) + return err; + + /* + * We need to ensure that the pages within the page cache for + * the range covered by this I/O are written to disk and + * invalidated. This is an attempt to preserve the expected + * direct I/O semantics in the case where we fall back to + * buffered I/O to complete the I/O request.
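+ * + * For example, if 8 KiB completed via DIO and the remaining 4 KiB went + * through the page cache, that trailing range is flushed and its pages + * invalidated so that a later O_DIRECT read sees the data on disk.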
+ */ + ret += err; + endbyte = offset + err - 1; + err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping, + offset, endbyte); + if (!err) + invalidate_mapping_pages(iocb->ki_filp->f_mapping, + offset >> PAGE_SHIFT, + endbyte >> PAGE_SHIFT); + } + + return ret; +} + +#ifdef CONFIG_FS_DAX +static ssize_t +ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + ssize_t ret; + size_t count; + loff_t offset; + handle_t *handle; + bool extend = false; + struct inode *inode = file_inode(iocb->ki_filp); + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + + ret = ext4_write_checks(iocb, from); + if (ret <= 0) + goto out; + + offset = iocb->ki_pos; + count = iov_iter_count(from); + + if (offset + count > EXT4_I(inode)->i_disksize) { + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + + extend = true; + ext4_journal_stop(handle); + } + + ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); + + if (extend) { + ret = ext4_handle_inode_extension(inode, offset, ret, count); + ext4_inode_extension_cleanup(inode, ret < (ssize_t)count); + } +out: + inode_unlock(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} +#endif + +static ssize_t +ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + int ret; + struct inode *inode = file_inode(iocb->ki_filp); + + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; + +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + return ext4_dax_write_iter(iocb, from); +#endif + + if (iocb->ki_flags & IOCB_ATOMIC) { + size_t len = iov_iter_count(from); + + if (len < EXT4_SB(inode->i_sb)->s_awu_min || + len > EXT4_SB(inode->i_sb)->s_awu_max) + return -EINVAL; + + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } + + if (iocb->ki_flags & IOCB_DIRECT) + return ext4_dio_write_iter(iocb, from); + else + return ext4_buffered_write_iter(iocb, from); +} + +#ifdef CONFIG_FS_DAX +static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order) +{ + int error = 0; + vm_fault_t result; + int retries = 0; + handle_t *handle = NULL; + struct inode *inode = file_inode(vmf->vma->vm_file); + struct super_block *sb = inode->i_sb; + + /* + * We have to distinguish real writes from writes which will result in a + * COW page; COW writes should *not* poke the journal (the file will not + * be changed). Doing so would cause unintended failures when mounted + * read-only. + * + * We check for VM_SHARED rather than vmf->cow_page since the latter is + * unset for order != 0 (i.e. only in do_cow_fault); for + * other sizes, dax_iomap_fault will handle splitting / fallback so that + * we eventually come back with a COW page. 
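+ * + * For example, a write fault on a MAP_PRIVATE mapping has VM_SHARED clear, + * so no journal handle is started: the fault fills a COW page and the file + * itself is never modified.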
+ */ + bool write = (vmf->flags & FAULT_FLAG_WRITE) && + (vmf->vma->vm_flags & VM_SHARED); + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + unsigned long pfn; + + if (write) { + sb_start_pagefault(sb); + file_update_time(vmf->vma->vm_file); + filemap_invalidate_lock_shared(mapping); +retry: + handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, + EXT4_DATA_TRANS_BLOCKS(sb)); + if (IS_ERR(handle)) { + filemap_invalidate_unlock_shared(mapping); + sb_end_pagefault(sb); + return VM_FAULT_SIGBUS; + } + } else { + filemap_invalidate_lock_shared(mapping); + } + result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops); + if (write) { + ext4_journal_stop(handle); + + if ((result & VM_FAULT_ERROR) && error == -ENOSPC && + ext4_should_retry_alloc(sb, &retries)) + goto retry; + /* Handling synchronous page fault? */ + if (result & VM_FAULT_NEEDDSYNC) + result = dax_finish_sync_fault(vmf, order, pfn); + filemap_invalidate_unlock_shared(mapping); + sb_end_pagefault(sb); + } else { + filemap_invalidate_unlock_shared(mapping); + } + + return result; +} + +static vm_fault_t ext4_dax_fault(struct vm_fault *vmf) +{ + return ext4_dax_huge_fault(vmf, 0); +} + +static const struct vm_operations_struct ext4_dax_vm_ops = { + .fault = ext4_dax_fault, + .huge_fault = ext4_dax_huge_fault, + .page_mkwrite = ext4_dax_fault, + .pfn_mkwrite = ext4_dax_fault, +}; +#else +#define ext4_dax_vm_ops ext4_file_vm_ops +#endif + +static const struct vm_operations_struct ext4_file_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = ext4_page_mkwrite, +}; + +static int ext4_file_mmap_prepare(struct vm_area_desc *desc) +{ + int ret; + struct file *file = desc->file; + struct inode *inode = file->f_mapping->host; + struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; + + if (file->f_mode & FMODE_WRITE) + ret = ext4_emergency_state(inode->i_sb); + else + ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; + if (unlikely(ret)) + return ret; + + /* + * We don't support synchronous mappings for non-DAX files and + * for DAX files if underneath dax_device is not synchronous. + */ + if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), dax_dev)) + return -EOPNOTSUPP; + + file_accessed(file); + if (IS_DAX(file_inode(file))) { + desc->vm_ops = &ext4_dax_vm_ops; + desc->vm_flags |= VM_HUGEPAGE; + } else { + desc->vm_ops = &ext4_file_vm_ops; + } + return 0; +} + +static int ext4_sample_last_mounted(struct super_block *sb, + struct vfsmount *mnt) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct path path; + char buf[64], *cp; + handle_t *handle; + int err; + + if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED))) + return 0; + + if (ext4_emergency_state(sb) || sb_rdonly(sb) || + !sb_start_intwrite_trylock(sb)) + return 0; + + ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED); + /* + * Sample where the filesystem has been mounted and + * store it in the superblock for sysadmin convenience + * when trying to sort through large numbers of block + * devices or filesystem images. 
+ */ + memset(buf, 0, sizeof(buf)); + path.mnt = mnt; + path.dentry = mnt->mnt_root; + cp = d_path(&path, buf, sizeof(buf)); + err = 0; + if (IS_ERR(cp)) + goto out; + + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + err = PTR_ERR(handle); + if (IS_ERR(handle)) + goto out; + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); + if (err) + goto out_journal; + lock_buffer(sbi->s_sbh); + strtomem_pad(sbi->s_es->s_last_mounted, cp, 0); + ext4_superblock_csum_set(sb); + unlock_buffer(sbi->s_sbh); + ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); +out_journal: + ext4_journal_stop(handle); +out: + sb_end_intwrite(sb); + return err; +} + +static int ext4_file_open(struct inode *inode, struct file *filp) +{ + int ret; + + if (filp->f_mode & FMODE_WRITE) + ret = ext4_emergency_state(inode->i_sb); + else + ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; + if (unlikely(ret)) + return ret; + + ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); + if (ret) + return ret; + + ret = fscrypt_file_open(inode, filp); + if (ret) + return ret; + + ret = fsverity_file_open(inode, filp); + if (ret) + return ret; + + /* + * Set up the jbd2_inode if we are opening the inode for + * writing and the journal is present + */ + if (filp->f_mode & FMODE_WRITE) { + ret = ext4_inode_attach_jinode(inode); + if (ret < 0) + return ret; + } + + if (ext4_inode_can_atomic_write(inode)) + filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + + filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; + return dquot_file_open(inode, filp); +} + +/* + * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values + * by calling generic_file_llseek_size() with the appropriate maxbytes + * value for each. 
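+ * + * For example, on a block-mapped (non-extent) file, offsets beyond the + * smaller s_bitmap_maxbytes limit are rejected, while extent-mapped files + * are checked against the larger s_maxbytes.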
+ */ +loff_t ext4_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes = ext4_get_maxbytes(inode); + + switch (whence) { + default: + return generic_file_llseek_size(file, offset, whence, + maxbytes, i_size_read(inode)); + case SEEK_HOLE: + inode_lock_shared(inode); + offset = iomap_seek_hole(inode, offset, + &ext4_iomap_report_ops); + inode_unlock_shared(inode); + break; + case SEEK_DATA: + inode_lock_shared(inode); + offset = iomap_seek_data(inode, offset, + &ext4_iomap_report_ops); + inode_unlock_shared(inode); + break; + } + + if (offset < 0) + return offset; + return vfs_setpos(file, offset, maxbytes); +} + +const struct file_operations ext4_file_operations = { + .llseek = ext4_llseek, + .read_iter = ext4_file_read_iter, + .write_iter = ext4_file_write_iter, + .iopoll = iocb_bio_iopoll, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .mmap_prepare = ext4_file_mmap_prepare, + .open = ext4_file_open, + .release = ext4_release_file, + .fsync = ext4_sync_file, + .get_unmapped_area = thp_get_unmapped_area, + .splice_read = ext4_file_splice_read, + .splice_write = iter_file_splice_write, + .fallocate = ext4_fallocate, + .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | + FOP_DIO_PARALLEL_WRITE | + FOP_DONTCACHE, +}; + +const struct inode_operations ext4_file_inode_operations = { + .setattr = ext4_setattr, + .getattr = ext4_file_getattr, + .listxattr = ext4_listxattr, + .get_inode_acl = ext4_get_acl, + .set_acl = ext4_set_acl, + .fiemap = ext4_fiemap, + .fileattr_get = ext4_fileattr_get, + .fileattr_set = ext4_fileattr_set, +}; + diff --git a/fs/ext4l/fsmap.c b/fs/ext4l/fsmap.c new file mode 100644 index 00000000000..22fc333244e --- /dev/null +++ b/fs/ext4l/fsmap.c @@ -0,0 +1,792 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include "ext4.h" +#include <linux/fsmap.h> +#include "fsmap.h" +#include "mballoc.h" +#include <linux/sort.h> +#include <linux/list_sort.h> +#include <trace/events/ext4.h> + +/* Convert an ext4_fsmap to an fsmap. */ +void ext4_fsmap_from_internal(struct super_block *sb, struct fsmap *dest, + struct ext4_fsmap *src) +{ + dest->fmr_device = src->fmr_device; + dest->fmr_flags = src->fmr_flags; + dest->fmr_physical = src->fmr_physical << sb->s_blocksize_bits; + dest->fmr_owner = src->fmr_owner; + dest->fmr_offset = 0; + dest->fmr_length = src->fmr_length << sb->s_blocksize_bits; + dest->fmr_reserved[0] = 0; + dest->fmr_reserved[1] = 0; + dest->fmr_reserved[2] = 0; +} + +/* Convert an fsmap to an ext4_fsmap. 
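Userspace fsmap fields are in bytes while the internal representation is in filesystem blocks, hence the shifts by s_blocksize_bits; e.g. with 4 KiB blocks, block 3 maps to byte offset 12288.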
*/ +void ext4_fsmap_to_internal(struct super_block *sb, struct ext4_fsmap *dest, + struct fsmap *src) +{ + dest->fmr_device = src->fmr_device; + dest->fmr_flags = src->fmr_flags; + dest->fmr_physical = src->fmr_physical >> sb->s_blocksize_bits; + dest->fmr_owner = src->fmr_owner; + dest->fmr_length = src->fmr_length >> sb->s_blocksize_bits; +} + +/* getfsmap query state */ +struct ext4_getfsmap_info { + struct ext4_fsmap_head *gfi_head; + ext4_fsmap_format_t gfi_formatter; /* formatting fn */ + void *gfi_format_arg;/* format buffer */ + ext4_fsblk_t gfi_next_fsblk; /* next fsblock we expect */ + u32 gfi_dev; /* device id */ + ext4_group_t gfi_agno; /* bg number, if applicable */ + struct ext4_fsmap gfi_low; /* low rmap key */ + struct ext4_fsmap gfi_high; /* high rmap key */ + struct ext4_fsmap gfi_lastfree; /* free ext at end of last bg */ + struct list_head gfi_meta_list; /* fixed metadata list */ + bool gfi_last; /* last extent? */ +}; + +/* Associate a device with a getfsmap handler. */ +struct ext4_getfsmap_dev { + int (*gfd_fn)(struct super_block *sb, + struct ext4_fsmap *keys, + struct ext4_getfsmap_info *info); + u32 gfd_dev; +}; + +/* Compare two getfsmap device handlers. */ +static int ext4_getfsmap_dev_compare(const void *p1, const void *p2) +{ + const struct ext4_getfsmap_dev *d1 = p1; + const struct ext4_getfsmap_dev *d2 = p2; + + return d1->gfd_dev - d2->gfd_dev; +} + +/* Compare a record against our starting point */ +static bool ext4_getfsmap_rec_before_low_key(struct ext4_getfsmap_info *info, + struct ext4_fsmap *rec) +{ + return rec->fmr_physical + rec->fmr_length <= + info->gfi_low.fmr_physical; +} + +/* + * Format a reverse mapping for getfsmap, having translated rm_startblock + * into the appropriate daddr units. + */ +static int ext4_getfsmap_helper(struct super_block *sb, + struct ext4_getfsmap_info *info, + struct ext4_fsmap *rec) +{ + struct ext4_fsmap fmr; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t rec_fsblk = rec->fmr_physical; + ext4_group_t agno; + ext4_grpblk_t cno; + int error; + + if (fatal_signal_pending(current)) + return -EINTR; + + /* + * Filter out records that start before our startpoint, if the + * caller requested that. + */ + if (ext4_getfsmap_rec_before_low_key(info, rec)) { + rec_fsblk += rec->fmr_length; + if (info->gfi_next_fsblk < rec_fsblk) + info->gfi_next_fsblk = rec_fsblk; + return EXT4_QUERY_RANGE_CONTINUE; + } + + /* Are we just counting mappings? */ + if (info->gfi_head->fmh_count == 0) { + if (info->gfi_head->fmh_entries == UINT_MAX) + return EXT4_QUERY_RANGE_ABORT; + + if (rec_fsblk > info->gfi_next_fsblk) + info->gfi_head->fmh_entries++; + + if (info->gfi_last) + return EXT4_QUERY_RANGE_CONTINUE; + + info->gfi_head->fmh_entries++; + + rec_fsblk += rec->fmr_length; + if (info->gfi_next_fsblk < rec_fsblk) + info->gfi_next_fsblk = rec_fsblk; + return EXT4_QUERY_RANGE_CONTINUE; + } + + /* + * If the record starts past the last physical block we saw, + * then we've found a gap. Report the gap as being owned by + * whatever the caller specified is the missing owner. 
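+ * + * For example, if the previous record ended at block 100 and this record + * starts at block 120, blocks 100-119 are emitted as their own extent with + * the special owner EXT4_FMR_OWN_UNKNOWN.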
+ */ + if (rec_fsblk > info->gfi_next_fsblk) { + if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count) + return EXT4_QUERY_RANGE_ABORT; + + ext4_get_group_no_and_offset(sb, info->gfi_next_fsblk, + &agno, &cno); + trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno, + EXT4_C2B(sbi, cno), + rec_fsblk - info->gfi_next_fsblk, + EXT4_FMR_OWN_UNKNOWN); + + fmr.fmr_device = info->gfi_dev; + fmr.fmr_physical = info->gfi_next_fsblk; + fmr.fmr_owner = EXT4_FMR_OWN_UNKNOWN; + fmr.fmr_length = rec_fsblk - info->gfi_next_fsblk; + fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; + error = info->gfi_formatter(&fmr, info->gfi_format_arg); + if (error) + return error; + info->gfi_head->fmh_entries++; + } + + if (info->gfi_last) + goto out; + + /* Fill out the extent we found */ + if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count) + return EXT4_QUERY_RANGE_ABORT; + + ext4_get_group_no_and_offset(sb, rec_fsblk, &agno, &cno); + trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno, EXT4_C2B(sbi, cno), + rec->fmr_length, rec->fmr_owner); + + fmr.fmr_device = info->gfi_dev; + fmr.fmr_physical = rec_fsblk; + fmr.fmr_owner = rec->fmr_owner; + fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; + fmr.fmr_length = rec->fmr_length; + error = info->gfi_formatter(&fmr, info->gfi_format_arg); + if (error) + return error; + info->gfi_head->fmh_entries++; + +out: + rec_fsblk += rec->fmr_length; + if (info->gfi_next_fsblk < rec_fsblk) + info->gfi_next_fsblk = rec_fsblk; + return EXT4_QUERY_RANGE_CONTINUE; +} + +static inline ext4_fsblk_t ext4_fsmap_next_pblk(struct ext4_fsmap *fmr) +{ + return fmr->fmr_physical + fmr->fmr_length; +} + +static int ext4_getfsmap_meta_helper(struct super_block *sb, + ext4_group_t agno, ext4_grpblk_t start, + ext4_grpblk_t len, void *priv) +{ + struct ext4_getfsmap_info *info = priv; + struct ext4_fsmap *p; + struct ext4_fsmap *tmp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t fsb, fs_start, fs_end; + int error; + + fs_start = fsb = (EXT4_C2B(sbi, start) + + ext4_group_first_block_no(sb, agno)); + fs_end = fs_start + EXT4_C2B(sbi, len); + + /* + * Return relevant extents from the meta_list. 
We emit all extents that + * partially/fully overlap with the query range + */ + list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) { + if (p->fmr_physical + p->fmr_length <= info->gfi_next_fsblk) { + list_del(&p->fmr_list); + kfree(p); + continue; + } + if (p->fmr_physical <= fs_end && + p->fmr_physical + p->fmr_length > fs_start) { + /* Emit the retained free extent record if present */ + if (info->gfi_lastfree.fmr_owner) { + error = ext4_getfsmap_helper(sb, info, + &info->gfi_lastfree); + if (error) + return error; + info->gfi_lastfree.fmr_owner = 0; + } + error = ext4_getfsmap_helper(sb, info, p); + if (error) + return error; + fsb = p->fmr_physical + p->fmr_length; + if (info->gfi_next_fsblk < fsb) + info->gfi_next_fsblk = fsb; + list_del(&p->fmr_list); + kfree(p); + continue; + } + } + if (info->gfi_next_fsblk < fsb) + info->gfi_next_fsblk = fsb; + + return 0; +} + + +/* Transform a blockgroup's free record into a fsmap */ +static int ext4_getfsmap_datadev_helper(struct super_block *sb, + ext4_group_t agno, ext4_grpblk_t start, + ext4_grpblk_t len, void *priv) +{ + struct ext4_fsmap irec; + struct ext4_getfsmap_info *info = priv; + struct ext4_fsmap *p; + struct ext4_fsmap *tmp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t fsb; + ext4_fsblk_t fslen; + int error; + + fsb = (EXT4_C2B(sbi, start) + ext4_group_first_block_no(sb, agno)); + fslen = EXT4_C2B(sbi, len); + + /* If the retained free extent record is set... */ + if (info->gfi_lastfree.fmr_owner) { + /* ...and abuts this one, lengthen it and return. */ + if (ext4_fsmap_next_pblk(&info->gfi_lastfree) == fsb) { + info->gfi_lastfree.fmr_length += fslen; + return 0; + } + + /* + * There's a gap between the two free extents; emit the + * retained extent prior to merging the meta_list. + */ + error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree); + if (error) + return error; + info->gfi_lastfree.fmr_owner = 0; + } + + /* Merge in any relevant extents from the meta_list */ + list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) { + if (p->fmr_physical + p->fmr_length <= info->gfi_next_fsblk) { + list_del(&p->fmr_list); + kfree(p); + } else if (p->fmr_physical < fsb) { + error = ext4_getfsmap_helper(sb, info, p); + if (error) + return error; + + list_del(&p->fmr_list); + kfree(p); + } + } + + irec.fmr_device = 0; + irec.fmr_physical = fsb; + irec.fmr_length = fslen; + irec.fmr_owner = EXT4_FMR_OWN_FREE; + irec.fmr_flags = 0; + + /* If this is a free extent at the end of a bg, buffer it. */ + if (ext4_fsmap_next_pblk(&irec) == + ext4_group_first_block_no(sb, agno + 1)) { + info->gfi_lastfree = irec; + return 0; + } + + /* Otherwise, emit it */ + return ext4_getfsmap_helper(sb, info, &irec); +} + +/* Execute a getfsmap query against the log device. */ +static int ext4_getfsmap_logdev(struct super_block *sb, struct ext4_fsmap *keys, + struct ext4_getfsmap_info *info) +{ + journal_t *journal = EXT4_SB(sb)->s_journal; + struct ext4_fsmap irec; + + /* Set up search keys */ + info->gfi_low = keys[0]; + info->gfi_low.fmr_length = 0; + + memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high)); + + trace_ext4_fsmap_low_key(sb, info->gfi_dev, 0, + info->gfi_low.fmr_physical, + info->gfi_low.fmr_length, + info->gfi_low.fmr_owner); + + trace_ext4_fsmap_high_key(sb, info->gfi_dev, 0, + info->gfi_high.fmr_physical, + info->gfi_high.fmr_length, + info->gfi_high.fmr_owner); + + if (keys[0].fmr_physical > 0) + return 0; + + /* Fabricate an rmap entry for the external log device. 
*/ + irec.fmr_physical = journal->j_blk_offset; + irec.fmr_length = journal->j_total_len; + irec.fmr_owner = EXT4_FMR_OWN_LOG; + irec.fmr_flags = 0; + + return ext4_getfsmap_helper(sb, info, &irec); +} + +/* Helper to fill out an ext4_fsmap. */ +static inline int ext4_getfsmap_fill(struct list_head *meta_list, + ext4_fsblk_t fsb, ext4_fsblk_t len, + uint64_t owner) +{ + struct ext4_fsmap *fsm; + + fsm = kmalloc(sizeof(*fsm), GFP_NOFS); + if (!fsm) + return -ENOMEM; + fsm->fmr_device = 0; + fsm->fmr_flags = 0; + fsm->fmr_physical = fsb; + fsm->fmr_owner = owner; + fsm->fmr_length = len; + list_add_tail(&fsm->fmr_list, meta_list); + + return 0; +} + +/* + * This function returns the number of file system metadata blocks at + * the beginning of a block group, including the reserved gdt blocks. + */ +static unsigned int ext4_getfsmap_find_sb(struct super_block *sb, + ext4_group_t agno, + struct list_head *meta_list) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t fsb = ext4_group_first_block_no(sb, agno); + ext4_fsblk_t len; + unsigned long first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); + unsigned long metagroup = agno / EXT4_DESC_PER_BLOCK(sb); + int error; + + /* Record the superblock. */ + if (ext4_bg_has_super(sb, agno)) { + error = ext4_getfsmap_fill(meta_list, fsb, 1, EXT4_FMR_OWN_FS); + if (error) + return error; + fsb++; + } + + /* Record the group descriptors. */ + len = ext4_bg_num_gdb(sb, agno); + if (!len) + return 0; + error = ext4_getfsmap_fill(meta_list, fsb, len, + EXT4_FMR_OWN_GDT); + if (error) + return error; + fsb += len; + + /* Reserved GDT blocks */ + if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) { + len = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + + /* + * mkfs.ext4 can set s_reserved_gdt_blocks as 0 in some cases, + * check for that. + */ + if (!len) + return 0; + + error = ext4_getfsmap_fill(meta_list, fsb, len, + EXT4_FMR_OWN_RESV_GDT); + if (error) + return error; + } + + return 0; +} + +/* Compare two fsmap items. */ +static int ext4_getfsmap_compare(void *priv, + const struct list_head *a, + const struct list_head *b) +{ + struct ext4_fsmap *fa; + struct ext4_fsmap *fb; + + fa = container_of(a, struct ext4_fsmap, fmr_list); + fb = container_of(b, struct ext4_fsmap, fmr_list); + if (fa->fmr_physical < fb->fmr_physical) + return -1; + else if (fa->fmr_physical > fb->fmr_physical) + return 1; + return 0; +} + +/* Merge adjacent extents of fixed metadata. */ +static void ext4_getfsmap_merge_fixed_metadata(struct list_head *meta_list) +{ + struct ext4_fsmap *p; + struct ext4_fsmap *prev = NULL; + struct ext4_fsmap *tmp; + + list_for_each_entry_safe(p, tmp, meta_list, fmr_list) { + if (!prev) { + prev = p; + continue; + } + + if (prev->fmr_owner == p->fmr_owner && + prev->fmr_physical + prev->fmr_length == p->fmr_physical) { + prev->fmr_length += p->fmr_length; + list_del(&p->fmr_list); + kfree(p); + } else + prev = p; + } +} + +/* Free a list of fixed metadata. */ +static void ext4_getfsmap_free_fixed_metadata(struct list_head *meta_list) +{ + struct ext4_fsmap *p; + struct ext4_fsmap *tmp; + + list_for_each_entry_safe(p, tmp, meta_list, fmr_list) { + list_del(&p->fmr_list); + kfree(p); + } +} + +/* Find all the fixed metadata in the filesystem. */ +static int ext4_getfsmap_find_fixed_metadata(struct super_block *sb, + struct list_head *meta_list) +{ + struct ext4_group_desc *gdp; + ext4_group_t agno; + int error; + + INIT_LIST_HEAD(meta_list); + + /* Collect everything. 
*/ + for (agno = 0; agno < EXT4_SB(sb)->s_groups_count; agno++) { + gdp = ext4_get_group_desc(sb, agno, NULL); + if (!gdp) { + error = -EFSCORRUPTED; + goto err; + } + + /* Superblock & GDT */ + error = ext4_getfsmap_find_sb(sb, agno, meta_list); + if (error) + goto err; + + /* Block bitmap */ + error = ext4_getfsmap_fill(meta_list, + ext4_block_bitmap(sb, gdp), 1, + EXT4_FMR_OWN_BLKBM); + if (error) + goto err; + + /* Inode bitmap */ + error = ext4_getfsmap_fill(meta_list, + ext4_inode_bitmap(sb, gdp), 1, + EXT4_FMR_OWN_INOBM); + if (error) + goto err; + + /* Inodes */ + error = ext4_getfsmap_fill(meta_list, + ext4_inode_table(sb, gdp), + EXT4_SB(sb)->s_itb_per_group, + EXT4_FMR_OWN_INODES); + if (error) + goto err; + } + + /* Sort the list */ + list_sort(NULL, meta_list, ext4_getfsmap_compare); + + /* Merge adjacent extents */ + ext4_getfsmap_merge_fixed_metadata(meta_list); + + return 0; +err: + ext4_getfsmap_free_fixed_metadata(meta_list); + return error; +} + +/* Execute a getfsmap query against the buddy bitmaps */ +static int ext4_getfsmap_datadev(struct super_block *sb, + struct ext4_fsmap *keys, + struct ext4_getfsmap_info *info) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t start_fsb; + ext4_fsblk_t end_fsb; + ext4_fsblk_t bofs; + ext4_fsblk_t eofs; + ext4_group_t start_ag; + ext4_group_t end_ag; + ext4_grpblk_t first_cluster; + ext4_grpblk_t last_cluster; + struct ext4_fsmap irec; + int error = 0; + + bofs = le32_to_cpu(sbi->s_es->s_first_data_block); + eofs = ext4_blocks_count(sbi->s_es); + if (keys[0].fmr_physical >= eofs) + return 0; + else if (keys[0].fmr_physical < bofs) + keys[0].fmr_physical = bofs; + if (keys[1].fmr_physical >= eofs) + keys[1].fmr_physical = eofs - 1; + if (keys[1].fmr_physical < keys[0].fmr_physical) + return 0; + start_fsb = keys[0].fmr_physical; + end_fsb = keys[1].fmr_physical; + + /* Determine first and last group to examine based on start and end */ + ext4_get_group_no_and_offset(sb, start_fsb, &start_ag, &first_cluster); + ext4_get_group_no_and_offset(sb, end_fsb, &end_ag, &last_cluster); + + /* + * Convert the fsmap low/high keys to bg based keys. Initialize + * low to the fsmap low key and max out the high key to the end + * of the bg. + */ + info->gfi_low = keys[0]; + info->gfi_low.fmr_physical = EXT4_C2B(sbi, first_cluster); + info->gfi_low.fmr_length = 0; + + memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high)); + + /* Assemble a list of all the fixed-location metadata. */ + error = ext4_getfsmap_find_fixed_metadata(sb, &info->gfi_meta_list); + if (error) + goto err; + + /* Query each bg */ + for (info->gfi_agno = start_ag; + info->gfi_agno <= end_ag; + info->gfi_agno++) { + /* + * Set the bg high key from the fsmap high key if this + * is the last bg that we're querying. 
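+ * + * For every earlier bg the high key keeps the all-ones sentinel set up + * above, so the whole of that block group is scanned.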
+ */ + if (info->gfi_agno == end_ag) { + info->gfi_high = keys[1]; + info->gfi_high.fmr_physical = EXT4_C2B(sbi, + last_cluster); + info->gfi_high.fmr_length = 0; + } + + trace_ext4_fsmap_low_key(sb, info->gfi_dev, info->gfi_agno, + info->gfi_low.fmr_physical, + info->gfi_low.fmr_length, + info->gfi_low.fmr_owner); + + trace_ext4_fsmap_high_key(sb, info->gfi_dev, info->gfi_agno, + info->gfi_high.fmr_physical, + info->gfi_high.fmr_length, + info->gfi_high.fmr_owner); + + error = ext4_mballoc_query_range(sb, info->gfi_agno, + EXT4_B2C(sbi, info->gfi_low.fmr_physical), + EXT4_B2C(sbi, info->gfi_high.fmr_physical), + ext4_getfsmap_meta_helper, + ext4_getfsmap_datadev_helper, info); + if (error) + goto err; + + /* + * Set the bg low key to the start of the bg prior to + * moving on to the next bg. + */ + if (info->gfi_agno == start_ag) + memset(&info->gfi_low, 0, sizeof(info->gfi_low)); + } + + /* Do we have a retained free extent? */ + if (info->gfi_lastfree.fmr_owner) { + error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree); + if (error) + goto err; + } + + /* + * The dummy record below will cause ext4_getfsmap_helper() to report + * any allocated blocks at the end of the range. + */ + irec.fmr_device = 0; + irec.fmr_physical = end_fsb + 1; + irec.fmr_length = 0; + irec.fmr_owner = EXT4_FMR_OWN_FREE; + irec.fmr_flags = 0; + + info->gfi_last = true; + error = ext4_getfsmap_helper(sb, info, &irec); + if (error) + goto err; + +err: + ext4_getfsmap_free_fixed_metadata(&info->gfi_meta_list); + return error; +} + +/* Do we recognize the device? */ +static bool ext4_getfsmap_is_valid_device(struct super_block *sb, + struct ext4_fsmap *fm) +{ + if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || + fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev)) + return true; + if (EXT4_SB(sb)->s_journal_bdev_file && + fm->fmr_device == + new_encode_dev(file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev)) + return true; + return false; +} + +/* Ensure that the low key is less than the high key. */ +static bool ext4_getfsmap_check_keys(struct ext4_fsmap *low_key, + struct ext4_fsmap *high_key) +{ + if (low_key->fmr_device > high_key->fmr_device) + return false; + if (low_key->fmr_device < high_key->fmr_device) + return true; + + if (low_key->fmr_physical > high_key->fmr_physical) + return false; + if (low_key->fmr_physical < high_key->fmr_physical) + return true; + + if (low_key->fmr_owner > high_key->fmr_owner) + return false; + if (low_key->fmr_owner < high_key->fmr_owner) + return true; + + return false; +} + +#define EXT4_GETFSMAP_DEVS 2 +/* + * Get filesystem's extents as described in head, and format for + * output. Calls formatter to fill the user's buffer until all + * extents are mapped, until the passed-in head->fmh_count slots have + * been filled, or until the formatter short-circuits the loop, if it + * is tracking filled-in extents on its own. + * + * Key to Confusion + * ---------------- + * There are multiple levels of keys and counters at work here: + * _fsmap_head.fmh_keys -- low and high fsmap keys passed in; + * these reflect fs-wide block addrs. + * dkeys -- fmh_keys used to query each device; + * these are fmh_keys but w/ the low key + * bumped up by fmr_length. + * _getfsmap_info.gfi_next_fsblk-- next fs block we expect to see; this + * is how we detect gaps in the fsmap + * records and report them. + * _getfsmap_info.gfi_low/high -- per-bg low/high keys computed from + * dkeys; used to query the free space. 
+ */ +int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, + ext4_fsmap_format_t formatter, void *arg) +{ + struct ext4_fsmap dkeys[2]; /* per-dev keys */ + struct ext4_getfsmap_dev handlers[EXT4_GETFSMAP_DEVS]; + struct ext4_getfsmap_info info = { NULL }; + int i; + int error = 0; + + if (head->fmh_iflags & ~FMH_IF_VALID) + return -EINVAL; + if (!ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[0]) || + !ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[1])) + return -EINVAL; + + head->fmh_entries = 0; + + /* Set up our device handlers. */ + memset(handlers, 0, sizeof(handlers)); + handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev); + handlers[0].gfd_fn = ext4_getfsmap_datadev; + if (EXT4_SB(sb)->s_journal_bdev_file) { + handlers[1].gfd_dev = new_encode_dev( + file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev); + handlers[1].gfd_fn = ext4_getfsmap_logdev; + } + + sort(handlers, EXT4_GETFSMAP_DEVS, sizeof(struct ext4_getfsmap_dev), + ext4_getfsmap_dev_compare, NULL); + + /* + * To continue where we left off, we allow userspace to use the + * last mapping from a previous call as the low key of the next. + * This is identified by a non-zero length in the low key. We + * have to increment the low key in this scenario to ensure we + * don't return the same mapping again, and instead return the + * very next mapping. + * + * Bump the physical offset as there can be no other mapping for + * the same physical block range. + */ + dkeys[0] = head->fmh_keys[0]; + dkeys[0].fmr_physical += dkeys[0].fmr_length; + dkeys[0].fmr_owner = 0; + dkeys[0].fmr_length = 0; + memset(&dkeys[1], 0xFF, sizeof(struct ext4_fsmap)); + + if (!ext4_getfsmap_check_keys(dkeys, &head->fmh_keys[1])) + return -EINVAL; + + info.gfi_next_fsblk = head->fmh_keys[0].fmr_physical + + head->fmh_keys[0].fmr_length; + info.gfi_formatter = formatter; + info.gfi_format_arg = arg; + info.gfi_head = head; + + /* For each device we support... */ + for (i = 0; i < EXT4_GETFSMAP_DEVS; i++) { + /* Is this device within the range the user asked for? */ + if (!handlers[i].gfd_fn) + continue; + if (head->fmh_keys[0].fmr_device > handlers[i].gfd_dev) + continue; + if (head->fmh_keys[1].fmr_device < handlers[i].gfd_dev) + break; + + /* + * If this device number matches the high key, we have + * to pass the high key to the handler to limit the + * query results. If the device number exceeds the + * low key, zero out the low key so that we get + * everything from the beginning. + */ + if (handlers[i].gfd_dev == head->fmh_keys[1].fmr_device) + dkeys[1] = head->fmh_keys[1]; + if (handlers[i].gfd_dev > head->fmh_keys[0].fmr_device) + memset(&dkeys[0], 0, sizeof(struct ext4_fsmap)); + + info.gfi_dev = handlers[i].gfd_dev; + info.gfi_last = false; + info.gfi_agno = -1; + error = handlers[i].gfd_fn(sb, dkeys, &info); + if (error) + break; + info.gfi_next_fsblk = 0; + } + + head->fmh_oflags = FMH_OF_DEV_T; + return error; +} diff --git a/fs/ext4l/fsmap.h b/fs/ext4l/fsmap.h new file mode 100644 index 00000000000..ac642be2302 --- /dev/null +++ b/fs/ext4l/fsmap.h @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. 
Wong <darrick.wong@oracle.com> + */ +#ifndef __EXT4_FSMAP_H__ +#define __EXT4_FSMAP_H__ + +struct fsmap; + +/* internal fsmap representation */ +struct ext4_fsmap { + struct list_head fmr_list; + dev_t fmr_device; /* device id */ + uint32_t fmr_flags; /* mapping flags */ + uint64_t fmr_physical; /* device offset of segment */ + uint64_t fmr_owner; /* owner id */ + uint64_t fmr_length; /* length of segment, blocks */ +}; + +struct ext4_fsmap_head { + uint32_t fmh_iflags; /* control flags */ + uint32_t fmh_oflags; /* output flags */ + unsigned int fmh_count; /* # of entries in array incl. input */ + unsigned int fmh_entries; /* # of entries filled in (output). */ + + struct ext4_fsmap fmh_keys[2]; /* low and high keys */ +}; + +void ext4_fsmap_from_internal(struct super_block *sb, struct fsmap *dest, + struct ext4_fsmap *src); +void ext4_fsmap_to_internal(struct super_block *sb, struct ext4_fsmap *dest, + struct fsmap *src); + +/* fsmap to userspace formatter - copy to user & advance pointer */ +typedef int (*ext4_fsmap_format_t)(struct ext4_fsmap *, void *); + +int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, + ext4_fsmap_format_t formatter, void *arg); + +#define EXT4_QUERY_RANGE_ABORT 1 +#define EXT4_QUERY_RANGE_CONTINUE 0 + +/* fmr_owner special values for FS_IOC_GETFSMAP; some share w/ XFS */ +#define EXT4_FMR_OWN_FREE FMR_OWN_FREE /* free space */ +#define EXT4_FMR_OWN_UNKNOWN FMR_OWN_UNKNOWN /* unknown owner */ +#define EXT4_FMR_OWN_FS FMR_OWNER('X', 1) /* static fs metadata */ +#define EXT4_FMR_OWN_LOG FMR_OWNER('X', 2) /* journalling log */ +#define EXT4_FMR_OWN_INODES FMR_OWNER('X', 5) /* inodes */ +#define EXT4_FMR_OWN_GDT FMR_OWNER('f', 1) /* group descriptors */ +#define EXT4_FMR_OWN_RESV_GDT FMR_OWNER('f', 2) /* reserved gdt blocks */ +#define EXT4_FMR_OWN_BLKBM FMR_OWNER('f', 3) /* block bitmap */ +#define EXT4_FMR_OWN_INOBM FMR_OWNER('f', 4) /* inode bitmap */ + +#endif /* __EXT4_FSMAP_H__ */ diff --git a/fs/ext4l/ialloc.c b/fs/ext4l/ialloc.c new file mode 100644 index 00000000000..ba4fd9aba1c --- /dev/null +++ b/fs/ext4l/ialloc.c @@ -0,0 +1,1621 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/ialloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * BSD ufs-inspired inode and directory allocation by + * Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/stat.h> +#include <linux/string.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> +#include <linux/random.h> +#include <linux/bitops.h> +#include <linux/blkdev.h> +#include <linux/cred.h> + +#include <asm/byteorder.h> + +#include "ext4.h" +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" + +#include <trace/events/ext4.h> + +/* + * ialloc.c contains the inode allocation and deallocation routines + */ + +/* + * The free inodes are managed by bitmaps. A file system contains several + * block groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block.
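+ * + * For example, with a 4 KiB block size a single bitmap block tracks up to + * 32768 bits, which bounds both the number of blocks and the number of + * inodes per group.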
+ */ + +/* + * To avoid calling the atomic setbit hundreds or thousands of times, we only + * need to use it within a single byte (to ensure we get endianness right). + * We can use memset for the rest of the bitmap as there are no other users. + */ +void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +{ + int i; + + if (start_bit >= end_bit) + return; + + ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); + for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) + ext4_set_bit(i, bitmap); + if (i < end_bit) + memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); +} + +void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) +{ + if (uptodate) { + set_buffer_uptodate(bh); + set_bitmap_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + +static int ext4_validate_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + ext4_group_t block_group, + struct buffer_head *bh) +{ + ext4_fsblk_t blk; + struct ext4_group_info *grp; + + if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) + return 0; + + if (buffer_verified(bh)) + return 0; + + grp = ext4_get_group_info(sb, block_group); + if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + return -EFSCORRUPTED; + + ext4_lock_group(sb, block_group); + if (buffer_verified(bh)) + goto verified; + blk = ext4_inode_bitmap(sb, desc); + if (!ext4_inode_bitmap_csum_verify(sb, desc, bh) || + ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " + "inode_bitmap = %llu", block_group, blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + return -EFSBADCRC; + } + set_buffer_verified(bh); +verified: + ext4_unlock_group(sb, block_group); + return 0; +} + +/* + * Read the inode allocation bitmap for a given block_group, reading + * into the specified slot in the superblock's bitmap cache. + * + * Return buffer_head of bitmap on success, or an ERR_PTR on error. 
+ */ +static struct buffer_head * +ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) +{ + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *bh = NULL; + ext4_fsblk_t bitmap_blk; + int err; + + desc = ext4_get_group_desc(sb, block_group, NULL); + if (!desc) + return ERR_PTR(-EFSCORRUPTED); + + bitmap_blk = ext4_inode_bitmap(sb, desc); + if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { + ext4_error(sb, "Invalid inode bitmap blk %llu in " + "block_group %u", bitmap_blk, block_group); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + return ERR_PTR(-EFSCORRUPTED); + } + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext4_warning(sb, "Cannot read inode bitmap - " + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); + return ERR_PTR(-ENOMEM); + } + if (bitmap_uptodate(bh)) + goto verify; + + lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + goto verify; + } + + ext4_lock_group(sb, block_group); + if (ext4_has_group_desc_csum(sb) && + (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) { + if (block_group == 0) { + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + ext4_error(sb, "Inode bitmap for bg 0 marked " + "uninitialized"); + err = -EFSCORRUPTED; + goto out; + } + memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), + sb->s_blocksize * 8, bh->b_data); + set_bitmap_uptodate(bh); + set_buffer_uptodate(bh); + set_buffer_verified(bh); + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + return bh; + } + ext4_unlock_group(sb, block_group); + + if (buffer_uptodate(bh)) { + /* + * if the group is not uninit and the bh is uptodate, + * the bitmap is also uptodate + */ + set_bitmap_uptodate(bh); + unlock_buffer(bh); + goto verify; + } + /* + * submit the buffer_head for reading + */ + trace_ext4_load_inode_bitmap(sb, block_group); + ext4_read_bh(bh, REQ_META | REQ_PRIO, + ext4_end_bitmap_read, + ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_EIO)); + if (!buffer_uptodate(bh)) { + put_bh(bh); + ext4_error_err(sb, EIO, "Cannot read inode bitmap - " + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + return ERR_PTR(-EIO); + } + +verify: + err = ext4_validate_inode_bitmap(sb, desc, block_group, bh); + if (err) + goto out; + return bh; +out: + put_bh(bh); + return ERR_PTR(err); +} + +/* + * NOTE! When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the hard disk.
+ */ +void ext4_free_inode(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + int is_directory; + unsigned long ino; + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bh2; + ext4_group_t block_group; + unsigned long bit; + struct ext4_group_desc *gdp; + struct ext4_super_block *es; + struct ext4_sb_info *sbi; + int fatal = 0, err, count, cleared; + struct ext4_group_info *grp; + + if (!sb) { + printk(KERN_ERR "EXT4-fs: %s:%d: inode on " + "nonexistent device\n", __func__, __LINE__); + return; + } + if (icount_read(inode) > 1) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", + __func__, __LINE__, inode->i_ino, + icount_read(inode)); + return; + } + if (inode->i_nlink) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n", + __func__, __LINE__, inode->i_ino, inode->i_nlink); + return; + } + sbi = EXT4_SB(sb); + + ino = inode->i_ino; + ext4_debug("freeing inode %lu\n", ino); + trace_ext4_free_inode(inode); + + dquot_initialize(inode); + dquot_free_inode(inode); + + is_directory = S_ISDIR(inode->i_mode); + + /* Do this BEFORE marking the inode not in use or returning an error */ + ext4_clear_inode(inode); + + es = sbi->s_es; + if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext4_error(sb, "reserved or nonexistent inode %lu", ino); + goto error_return; + } + block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); + /* Don't bother if the inode bitmap is corrupt. */ + if (IS_ERR(bitmap_bh)) { + fatal = PTR_ERR(bitmap_bh); + bitmap_bh = NULL; + goto error_return; + } + if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { + grp = ext4_get_group_info(sb, block_group); + if (!grp || unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) { + fatal = -EFSCORRUPTED; + goto error_return; + } + } + + BUFFER_TRACE(bitmap_bh, "get_write_access"); + fatal = ext4_journal_get_write_access(handle, sb, bitmap_bh, + EXT4_JTR_NONE); + if (fatal) + goto error_return; + + fatal = -ESRCH; + gdp = ext4_get_group_desc(sb, block_group, &bh2); + if (gdp) { + BUFFER_TRACE(bh2, "get_write_access"); + fatal = ext4_journal_get_write_access(handle, sb, bh2, + EXT4_JTR_NONE); + } + ext4_lock_group(sb, block_group); + cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data); + if (fatal || !cleared) { + ext4_unlock_group(sb, block_group); + goto out; + } + + count = ext4_free_inodes_count(sb, gdp) + 1; + ext4_free_inodes_set(sb, gdp, count); + if (is_directory) { + count = ext4_used_dirs_count(sb, gdp) - 1; + ext4_used_dirs_set(sb, gdp, count); + if (percpu_counter_initialized(&sbi->s_dirs_counter)) + percpu_counter_dec(&sbi->s_dirs_counter); + } + ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh); + ext4_group_desc_csum_set(sb, block_group, gdp); + ext4_unlock_group(sb, block_group); + + if (percpu_counter_initialized(&sbi->s_freeinodes_counter)) + percpu_counter_inc(&sbi->s_freeinodes_counter); + if (sbi->s_log_groups_per_flex) { + struct flex_groups *fg; + + fg = sbi_array_rcu_deref(sbi, s_flex_groups, + ext4_flex_group(sbi, block_group)); + atomic_inc(&fg->free_inodes); + if (is_directory) + atomic_dec(&fg->used_dirs); + } + BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); + fatal = ext4_handle_dirty_metadata(handle, NULL, bh2); +out: + if (cleared) { + BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (!fatal) + fatal = err; + } else { + ext4_error(sb, "bit already cleared for 
inode %lu", ino); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + } + +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, fatal); +} + +struct orlov_stats { + __u64 free_clusters; + __u32 free_inodes; + __u32 used_dirs; +}; + +/* + * Helper function for Orlov's allocator; returns critical information + * for a particular block group or flex_bg. If flex_size is 1, then g + * is a block group number; otherwise it is flex_bg number. + */ +static void get_orlov_stats(struct super_block *sb, ext4_group_t g, + int flex_size, struct orlov_stats *stats) +{ + struct ext4_group_desc *desc; + + if (flex_size > 1) { + struct flex_groups *fg = sbi_array_rcu_deref(EXT4_SB(sb), + s_flex_groups, g); + stats->free_inodes = atomic_read(&fg->free_inodes); + stats->free_clusters = atomic64_read(&fg->free_clusters); + stats->used_dirs = atomic_read(&fg->used_dirs); + return; + } + + desc = ext4_get_group_desc(sb, g, NULL); + if (desc) { + stats->free_inodes = ext4_free_inodes_count(sb, desc); + stats->free_clusters = ext4_free_group_clusters(sb, desc); + stats->used_dirs = ext4_used_dirs_count(sb, desc); + } else { + stats->free_inodes = 0; + stats->free_clusters = 0; + stats->used_dirs = 0; + } +} + +/* + * Orlov's allocator for directories. + * + * We always try to spread first-level directories. + * + * If there are blockgroups with both free inodes and free clusters counts + * not worse than average we return one with smallest directory count. + * Otherwise we simply return a random group. + * + * For the rest rules look so: + * + * It's OK to put directory into a group unless + * it has too many directories already (max_dirs) or + * it has too few free inodes left (min_inodes) or + * it has too few free clusters left (min_clusters) or + * Parent's group is preferred, if it doesn't satisfy these + * conditions we search cyclically through the rest. If none + * of the groups look good we just look for a group with more + * free inodes than average (starting at parent's group). 
+ */ + +static int find_group_orlov(struct super_block *sb, struct inode *parent, + ext4_group_t *group, umode_t mode, + const struct qstr *qstr) +{ + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t real_ngroups = ext4_get_groups_count(sb); + int inodes_per_group = EXT4_INODES_PER_GROUP(sb); + unsigned int freei, avefreei, grp_free; + ext4_fsblk_t freec, avefreec; + unsigned int ndirs; + int max_dirs, min_inodes; + ext4_grpblk_t min_clusters; + ext4_group_t i, grp, g, ngroups; + struct ext4_group_desc *desc; + struct orlov_stats stats; + int flex_size = ext4_flex_bg_size(sbi); + struct dx_hash_info hinfo; + + ngroups = real_ngroups; + if (flex_size > 1) { + ngroups = (real_ngroups + flex_size - 1) >> + sbi->s_log_groups_per_flex; + parent_group >>= sbi->s_log_groups_per_flex; + } + + freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); + avefreei = freei / ngroups; + freec = percpu_counter_read_positive(&sbi->s_freeclusters_counter); + avefreec = freec; + do_div(avefreec, ngroups); + ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); + + if (S_ISDIR(mode) && + ((parent == d_inode(sb->s_root)) || + (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) { + int best_ndir = inodes_per_group; + int ret = -1; + + if (qstr) { + hinfo.hash_version = DX_HASH_HALF_MD4; + hinfo.seed = sbi->s_hash_seed; + ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo); + parent_group = hinfo.hash % ngroups; + } else + parent_group = get_random_u32_below(ngroups); + for (i = 0; i < ngroups; i++) { + g = (parent_group + i) % ngroups; + get_orlov_stats(sb, g, flex_size, &stats); + if (!stats.free_inodes) + continue; + if (stats.used_dirs >= best_ndir) + continue; + if (stats.free_inodes < avefreei) + continue; + if (stats.free_clusters < avefreec) + continue; + grp = g; + ret = 0; + best_ndir = stats.used_dirs; + } + if (ret) + goto fallback; + found_flex_bg: + if (flex_size == 1) { + *group = grp; + return 0; + } + + /* + * We pack inodes at the beginning of the flexgroup's + * inode tables. Block allocation decisions will do + * something similar, although regular files will + * start at 2nd block group of the flexgroup. See + * ext4_ext_find_goal() and ext4_find_near(). 
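 *
 * (As a worked example of that fan-out: with 16 block groups per flex
 * group, a winning flex group number of 3 expands to grp = 3 * 16 = 48,
 * and the loop then probes block groups 48..63, stopping at the first
 * one with free inodes.)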
+ */ + grp *= flex_size; + for (i = 0; i < flex_size; i++) { + if (grp+i >= real_ngroups) + break; + desc = ext4_get_group_desc(sb, grp+i, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) { + *group = grp+i; + return 0; + } + } + goto fallback; + } + + max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16; + min_inodes = avefreei - inodes_per_group*flex_size / 4; + if (min_inodes < 1) + min_inodes = 1; + min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4; + if (min_clusters < 0) + min_clusters = 0; + + /* + * Start looking in the flex group where we last allocated an + * inode for this parent directory + */ + if (EXT4_I(parent)->i_last_alloc_group != ~0) { + parent_group = EXT4_I(parent)->i_last_alloc_group; + if (flex_size > 1) + parent_group >>= sbi->s_log_groups_per_flex; + } + + for (i = 0; i < ngroups; i++) { + grp = (parent_group + i) % ngroups; + get_orlov_stats(sb, grp, flex_size, &stats); + if (stats.used_dirs >= max_dirs) + continue; + if (stats.free_inodes < min_inodes) + continue; + if (stats.free_clusters < min_clusters) + continue; + goto found_flex_bg; + } + +fallback: + ngroups = real_ngroups; + avefreei = freei / ngroups; +fallback_retry: + parent_group = EXT4_I(parent)->i_block_group; + for (i = 0; i < ngroups; i++) { + grp = (parent_group + i) % ngroups; + desc = ext4_get_group_desc(sb, grp, NULL); + if (desc) { + grp_free = ext4_free_inodes_count(sb, desc); + if (grp_free && grp_free >= avefreei) { + *group = grp; + return 0; + } + } + } + + if (avefreei) { + /* + * The free-inodes counter is approximate, and for really small + * filesystems the above test can fail to find any blockgroups + */ + avefreei = 0; + goto fallback_retry; + } + + return -1; +} + +static int find_group_other(struct super_block *sb, struct inode *parent, + ext4_group_t *group, umode_t mode) +{ + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); + struct ext4_group_desc *desc; + int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); + + /* + * Try to place the inode is the same flex group as its + * parent. If we can't find space, use the Orlov algorithm to + * find another flex group, and store that information in the + * parent directory's inode information so that use that flex + * group for future allocations. + */ + if (flex_size > 1) { + int retry = 0; + + try_again: + parent_group &= ~(flex_size-1); + last = parent_group + flex_size; + if (last > ngroups) + last = ngroups; + for (i = parent_group; i < last; i++) { + desc = ext4_get_group_desc(sb, i, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) { + *group = i; + return 0; + } + } + if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) { + retry = 1; + parent_group = EXT4_I(parent)->i_last_alloc_group; + goto try_again; + } + /* + * If this didn't work, use the Orlov search algorithm + * to find a new flex group; we pass in the mode to + * avoid the topdir algorithms. + */ + *group = parent_group + flex_size; + if (*group > ngroups) + *group = 0; + return find_group_orlov(sb, parent, group, mode, NULL); + } + + /* + * Try to place the inode in its parent directory + */ + *group = parent_group; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_group_clusters(sb, desc)) + return 0; + + /* + * We're going to place this inode in a different blockgroup from its + * parent. We want to cause files in a common directory to all land in + * the same blockgroup. 
But we want files which are in a different + * directory which shares a blockgroup with our parent to land in a + * different blockgroup. + * + * So add our directory's i_ino into the starting point for the hash. + */ + *group = (*group + parent->i_ino) % ngroups; + + /* + * Use a quadratic hash to find a group with a free inode and some free + * blocks. + */ + for (i = 1; i < ngroups; i <<= 1) { + *group += i; + if (*group >= ngroups) + *group -= ngroups; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_group_clusters(sb, desc)) + return 0; + } + + /* + * That failed: try linear search for a free inode, even if that group + * has no free blocks. + */ + *group = parent_group; + for (i = 0; i < ngroups; i++) { + if (++*group >= ngroups) + *group = 0; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) + return 0; + } + + return -1; +} + +/* + * In no journal mode, if an inode has recently been deleted, we want + * to avoid reusing it until we're reasonably sure the inode table + * block has been written back to disk. (Yes, these values are + * somewhat arbitrary...) + */ +#define RECENTCY_MIN 60 +#define RECENTCY_DIRTY 300 + +static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) +{ + struct ext4_group_desc *gdp; + struct ext4_inode *raw_inode; + struct buffer_head *bh; + int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + int offset, ret = 0; + int recentcy = RECENTCY_MIN; + u32 dtime, now; + + gdp = ext4_get_group_desc(sb, group, NULL); + if (unlikely(!gdp)) + return 0; + + bh = sb_find_get_block(sb, ext4_inode_table(sb, gdp) + + (ino / inodes_per_block)); + if (!bh || !buffer_uptodate(bh)) + /* + * If the block is not in the buffer cache, then it + * must have been written out, or, most unlikely, is + * being migrated - false failure should be OK here. + */ + goto out; + + offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb); + raw_inode = (struct ext4_inode *) (bh->b_data + offset); + + /* i_dtime is only 32 bits on disk, but we only care about relative + * times in the range of a few minutes (i.e. long enough to sync a + * recently-deleted inode to disk), so using the low 32 bits of the + * clock (a 68 year range) is enough, see time_before32() */ + dtime = le32_to_cpu(raw_inode->i_dtime); + now = ktime_get_real_seconds(); + if (buffer_dirty(bh)) + recentcy += RECENTCY_DIRTY; + + if (dtime && time_before32(dtime, now) && + time_before32(now, dtime + recentcy)) + ret = 1; +out: + brelse(bh); + return ret; +} + +static int find_inode_bit(struct super_block *sb, ext4_group_t group, + struct buffer_head *bitmap, unsigned long *ino) +{ + bool check_recently_deleted = EXT4_SB(sb)->s_journal == NULL; + unsigned long recently_deleted_ino = EXT4_INODES_PER_GROUP(sb); + +next: + *ino = ext4_find_next_zero_bit((unsigned long *) + bitmap->b_data, + EXT4_INODES_PER_GROUP(sb), *ino); + if (*ino >= EXT4_INODES_PER_GROUP(sb)) + goto not_found; + + if (check_recently_deleted && recently_deleted(sb, group, *ino)) { + recently_deleted_ino = *ino; + *ino = *ino + 1; + if (*ino < EXT4_INODES_PER_GROUP(sb)) + goto next; + goto not_found; + } + return 1; +not_found: + if (recently_deleted_ino >= EXT4_INODES_PER_GROUP(sb)) + return 0; + /* + * Not reusing recently deleted inodes is mostly a preference. We don't + * want to report ENOSPC or skew allocation patterns because of that. + * So return even recently deleted inode if we could find better in the + * given range. 
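 *
 * (The dtime window test above relies on wraparound-safe 32-bit time
 * comparison. A self-contained sketch of the same check follows, with
 * the time_before32() helper open-coded as a signed-difference test --
 * the usual kernel idiom, stated here as an assumption rather than a
 * quote of the macro.)

	#include <stdint.h>

	/* a is earlier than b, modulo 2^32 */
	static inline int before32(uint32_t a, uint32_t b)
	{
		return (int32_t)(a - b) < 0;
	}

	/* Was the inode freed within the last 'recentcy' seconds? */
	static int deleted_recently(uint32_t dtime, uint32_t now,
				    uint32_t recentcy)
	{
		return dtime && before32(dtime, now) &&
		       before32(now, dtime + recentcy);
	}

 *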
+ */ + *ino = recently_deleted_ino; + return 1; +} + +int ext4_mark_inode_used(struct super_block *sb, int ino) +{ + unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); + struct buffer_head *inode_bitmap_bh = NULL, *group_desc_bh = NULL; + struct ext4_group_desc *gdp; + ext4_group_t group; + int bit; + int err; + + if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) + return -EFSCORRUPTED; + + group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); + inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); + if (IS_ERR(inode_bitmap_bh)) + return PTR_ERR(inode_bitmap_bh); + + if (ext4_test_bit(bit, inode_bitmap_bh->b_data)) { + err = 0; + goto out; + } + + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); + if (!gdp) { + err = -EINVAL; + goto out; + } + + ext4_set_bit(bit, inode_bitmap_bh->b_data); + + BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(NULL, NULL, inode_bitmap_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + err = sync_dirty_buffer(inode_bitmap_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + + /* We may have to initialize the block bitmap if it isn't already */ + if (ext4_has_group_desc_csum(sb) && + gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + struct buffer_head *block_bitmap_bh; + + block_bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(block_bitmap_bh)) { + err = PTR_ERR(block_bitmap_bh); + goto out; + } + + BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); + err = ext4_handle_dirty_metadata(NULL, NULL, block_bitmap_bh); + sync_dirty_buffer(block_bitmap_bh); + + /* recheck and clear flag under lock if we still need to */ + ext4_lock_group(sb, group); + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, group, gdp)); + ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh); + ext4_group_desc_csum_set(sb, group, gdp); + } + ext4_unlock_group(sb, group); + brelse(block_bitmap_bh); + + if (err) { + ext4_std_error(sb, err); + goto out; + } + } + + /* Update the relevant bg descriptor fields */ + if (ext4_has_group_desc_csum(sb)) { + int free; + + ext4_lock_group(sb, group); /* while we modify the bg desc */ + free = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); + free = 0; + } + + /* + * Check the relative inode number against the last used + * relative inode number in this group. 
if it is greater + * we need to update the bg_itable_unused count + */ + if (bit >= free) + ext4_itable_unused_set(sb, gdp, + (EXT4_INODES_PER_GROUP(sb) - bit - 1)); + } else { + ext4_lock_group(sb, group); + } + + ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); + if (ext4_has_group_desc_csum(sb)) { + ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh); + ext4_group_desc_csum_set(sb, group, gdp); + } + + ext4_unlock_group(sb, group); + err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh); + sync_dirty_buffer(group_desc_bh); +out: + brelse(inode_bitmap_bh); + return err; +} + +static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode, + bool encrypt) +{ + struct super_block *sb = dir->i_sb; + int nblocks = 0; +#ifdef CONFIG_EXT4_FS_POSIX_ACL + struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT); + + if (IS_ERR(p)) + return PTR_ERR(p); + if (p) { + int acl_size = p->a_count * sizeof(ext4_acl_entry); + + nblocks += (S_ISDIR(mode) ? 2 : 1) * + __ext4_xattr_set_credits(sb, NULL /* inode */, + NULL /* block_bh */, acl_size, + true /* is_create */); + posix_acl_release(p); + } +#endif + +#ifdef CONFIG_SECURITY + { + int num_security_xattrs = 1; + +#ifdef CONFIG_INTEGRITY + num_security_xattrs++; +#endif + /* + * We assume that security xattrs are never more than 1k. + * In practice they are under 128 bytes. + */ + nblocks += num_security_xattrs * + __ext4_xattr_set_credits(sb, NULL /* inode */, + NULL /* block_bh */, 1024, + true /* is_create */); + } +#endif + if (encrypt) + nblocks += __ext4_xattr_set_credits(sb, + NULL /* inode */, + NULL /* block_bh */, + FSCRYPT_SET_CONTEXT_MAX_SIZE, + true /* is_create */); + return nblocks; +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory's block + * group to find a free inode. 
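 *
 * (A worked example of the goal-to-location arithmetic used just below:
 * with 8192 inodes per group, goal inode 20000 maps to
 * group = (20000 - 1) / 8192 = 2 and in-group bit
 * (20000 - 1) % 8192 = 3615. Inode numbers are 1-based while bitmap
 * bits are 0-based, hence the "- 1" here and the later "ino++".)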
+ */ +struct inode *__ext4_new_inode(struct mnt_idmap *idmap, + handle_t *handle, struct inode *dir, + umode_t mode, const struct qstr *qstr, + __u32 goal, uid_t *owner, __u32 i_flags, + int handle_type, unsigned int line_no, + int nblocks) +{ + struct super_block *sb; + struct buffer_head *inode_bitmap_bh = NULL; + struct buffer_head *group_desc_bh; + ext4_group_t ngroups, group = 0; + unsigned long ino = 0; + struct inode *inode; + struct ext4_group_desc *gdp = NULL; + struct ext4_inode_info *ei; + struct ext4_sb_info *sbi; + int ret2, err; + struct inode *ret; + ext4_group_t i; + ext4_group_t flex_group; + struct ext4_group_info *grp = NULL; + bool encrypt = false; + + /* Cannot create files in a deleted directory */ + if (!dir || !dir->i_nlink) + return ERR_PTR(-EPERM); + + sb = dir->i_sb; + sbi = EXT4_SB(sb); + + ret2 = ext4_emergency_state(sb); + if (unlikely(ret2)) + return ERR_PTR(ret2); + + ngroups = ext4_get_groups_count(sb); + trace_ext4_request_inode(dir, mode); + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + ei = EXT4_I(inode); + + /* + * Initialize owners and quota early so that we don't have to account + * for quota initialization worst case in standard inode creating + * transaction + */ + if (owner) { + inode->i_mode = mode; + i_uid_write(inode, owner[0]); + i_gid_write(inode, owner[1]); + } else if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode_fsuid_set(inode, idmap); + inode->i_gid = dir->i_gid; + } else + inode_init_owner(idmap, inode, dir, mode); + + if (ext4_has_feature_project(sb) && + ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) + ei->i_projid = EXT4_I(dir)->i_projid; + else + ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID); + + if (!(i_flags & EXT4_EA_INODE_FL)) { + err = fscrypt_prepare_new_inode(dir, inode, &encrypt); + if (err) + goto out; + } + + err = dquot_initialize(inode); + if (err) + goto out; + + if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) { + ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt); + if (ret2 < 0) { + err = ret2; + goto out; + } + nblocks += ret2; + } + + if (!goal) + goal = sbi->s_inode_goal; + + if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) { + group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); + ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); + ret2 = 0; + goto got_group; + } + + if (S_ISDIR(mode)) + ret2 = find_group_orlov(sb, dir, &group, mode, qstr); + else + ret2 = find_group_other(sb, dir, &group, mode); + +got_group: + EXT4_I(dir)->i_last_alloc_group = group; + err = -ENOSPC; + if (ret2 == -1) + goto out; + + /* + * Normally we will only go through one pass of this loop, + * unless we get unlucky and it turns out the group we selected + * had its last inode grabbed by someone else. + */ + for (i = 0; i < ngroups; i++, ino = 0) { + err = -EIO; + + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); + if (!gdp) + goto out; + + /* + * Check free inodes count before loading bitmap. 
+ */ + if (ext4_free_inodes_count(sb, gdp) == 0) + goto next_group; + + if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { + grp = ext4_get_group_info(sb, group); + /* + * Skip groups with already-known suspicious inode + * tables + */ + if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + goto next_group; + } + + brelse(inode_bitmap_bh); + inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); + /* Skip groups with suspicious inode tables */ + if (IS_ERR(inode_bitmap_bh)) { + inode_bitmap_bh = NULL; + goto next_group; + } + if (!(sbi->s_mount_state & EXT4_FC_REPLAY) && + EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + goto next_group; + + ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); + if (!ret2) + goto next_group; + + if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) { + ext4_error(sb, "reserved inode found cleared - " + "inode=%lu", ino + 1); + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + goto next_group; + } + + if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) { + BUG_ON(nblocks <= 0); + handle = __ext4_journal_start_sb(NULL, dir->i_sb, + line_no, handle_type, nblocks, 0, + ext4_trans_default_revoke_credits(sb)); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext4_std_error(sb, err); + goto out; + } + } + BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, inode_bitmap_bh, + EXT4_JTR_NONE); + if (err) { + ext4_std_error(sb, err); + goto out; + } + ext4_lock_group(sb, group); + ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); + if (ret2) { + /* Someone already took the bit. Repeat the search + * with lock held. + */ + ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); + if (ret2) { + ext4_set_bit(ino, inode_bitmap_bh->b_data); + ret2 = 0; + } else { + ret2 = 1; /* we didn't grab the inode */ + } + } + ext4_unlock_group(sb, group); + ino++; /* the inode bitmap is zero-based */ + if (!ret2) + goto got; /* we grabbed the inode! 
*/ + +next_group: + if (++group == ngroups) + group = 0; + } + err = -ENOSPC; + goto out; + +got: + BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, group_desc_bh, + EXT4_JTR_NONE); + if (err) { + ext4_std_error(sb, err); + goto out; + } + + /* We may have to initialize the block bitmap if it isn't already */ + if (ext4_has_group_desc_csum(sb) && + gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + struct buffer_head *block_bitmap_bh; + + block_bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(block_bitmap_bh)) { + err = PTR_ERR(block_bitmap_bh); + goto out; + } + BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); + err = ext4_journal_get_write_access(handle, sb, block_bitmap_bh, + EXT4_JTR_NONE); + if (err) { + brelse(block_bitmap_bh); + ext4_std_error(sb, err); + goto out; + } + + BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); + err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); + + /* recheck and clear flag under lock if we still need to */ + ext4_lock_group(sb, group); + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, group, gdp)); + ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh); + ext4_group_desc_csum_set(sb, group, gdp); + } + ext4_unlock_group(sb, group); + brelse(block_bitmap_bh); + + if (err) { + ext4_std_error(sb, err); + goto out; + } + } + + /* Update the relevant bg descriptor fields */ + if (ext4_has_group_desc_csum(sb)) { + int free; + struct ext4_group_info *grp = NULL; + + if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { + grp = ext4_get_group_info(sb, group); + if (!grp) { + err = -EFSCORRUPTED; + goto out; + } + down_read(&grp->alloc_sem); /* + * protect vs itable + * lazyinit + */ + } + ext4_lock_group(sb, group); /* while we modify the bg desc */ + free = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); + free = 0; + } + /* + * Check the relative inode number against the last used + * relative inode number in this group. 
if it is greater + * we need to update the bg_itable_unused count + */ + if (ino > free) + ext4_itable_unused_set(sb, gdp, + (EXT4_INODES_PER_GROUP(sb) - ino)); + if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) + up_read(&grp->alloc_sem); + } else { + ext4_lock_group(sb, group); + } + + ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); + if (S_ISDIR(mode)) { + ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, group); + + atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups, + f)->used_dirs); + } + } + if (ext4_has_group_desc_csum(sb)) { + ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh); + ext4_group_desc_csum_set(sb, group, gdp); + } + ext4_unlock_group(sb, group); + + BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + + percpu_counter_dec(&sbi->s_freeinodes_counter); + if (S_ISDIR(mode)) + percpu_counter_inc(&sbi->s_dirs_counter); + + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, group); + atomic_dec(&sbi_array_rcu_deref(sbi, s_flex_groups, + flex_group)->free_inodes); + } + + inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); + /* This is the optimal IO size (for stat), not the fs block size */ + inode->i_blocks = 0; + simple_inode_init_ts(inode); + ei->i_crtime = inode_get_mtime(inode); + + memset(ei->i_data, 0, sizeof(ei->i_data)); + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + + /* Don't inherit extent flag from directory, amongst others. */ + ei->i_flags = + ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); + ei->i_flags |= i_flags; + ei->i_file_acl = 0; + ei->i_dtime = 0; + ei->i_block_group = group; + ei->i_last_alloc_group = ~0; + + ext4_set_inode_flags(inode, true); + if (IS_DIRSYNC(inode)) + ext4_handle_sync(handle); + if (insert_inode_locked(inode) < 0) { + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. + */ + err = -EIO; + ext4_error(sb, "failed to insert inode %lu: doubly allocated?", + inode->i_ino); + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + goto out; + } + inode->i_generation = get_random_u32(); + + /* Precompute checksum seed for inode metadata */ + if (ext4_has_feature_metadata_csum(sb)) { + __u32 csum; + __le32 inum = cpu_to_le32(inode->i_ino); + __le32 gen = cpu_to_le32(inode->i_generation); + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, + sizeof(inum)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); + } + + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ + ext4_set_inode_state(inode, EXT4_STATE_NEW); + + ei->i_extra_isize = sbi->s_want_extra_isize; + ei->i_inline_off = 0; + if (ext4_has_feature_inline_data(sb) && + (!(ei->i_flags & (EXT4_DAX_FL|EXT4_EA_INODE_FL)) || S_ISDIR(mode))) + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + ret = inode; + err = dquot_alloc_inode(inode); + if (err) + goto fail_drop; + + /* + * Since the encryption xattr will always be unique, create it first so + * that it's less likely to end up in an external xattr block and + * prevent its deduplication. 
+ */ + if (encrypt) { + err = fscrypt_set_context(inode, handle); + if (err) + goto fail_free_drop; + } + + if (!(ei->i_flags & EXT4_EA_INODE_FL)) { + err = ext4_init_acl(handle, inode, dir); + if (err) + goto fail_free_drop; + + err = ext4_init_security(handle, inode, dir, qstr); + if (err) + goto fail_free_drop; + } + + if (ext4_has_feature_extents(sb)) { + /* set extent flag only for directory, file and normal symlink*/ + if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode); + } + } + + ext4_set_inode_mapping_order(inode); + + ext4_update_inode_fsync_trans(handle, inode, 1); + + err = ext4_mark_inode_dirty(handle, inode); + if (err) { + ext4_std_error(sb, err); + goto fail_free_drop; + } + + ext4_debug("allocating inode %lu\n", inode->i_ino); + trace_ext4_allocate_inode(inode, dir, mode); + brelse(inode_bitmap_bh); + return ret; + +fail_free_drop: + dquot_free_inode(inode); +fail_drop: + clear_nlink(inode); + unlock_new_inode(inode); +out: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + iput(inode); + brelse(inode_bitmap_bh); + return ERR_PTR(err); +} + +/* Verify that we are loading a valid orphan from disk */ +struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) +{ + unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); + ext4_group_t block_group; + int bit; + struct buffer_head *bitmap_bh = NULL; + struct inode *inode = NULL; + int err = -EFSCORRUPTED; + + if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) + goto bad_orphan; + + block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); + if (IS_ERR(bitmap_bh)) + return ERR_CAST(bitmap_bh); + + /* Having the inode bit set should be a 100% indicator that this + * is a valid orphan (no e2fsck run on fs). Orphans also include + * inodes that were being truncated, so we can't check i_nlink==0. + */ + if (!ext4_test_bit(bit, bitmap_bh->b_data)) + goto bad_orphan; + + inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + ext4_error_err(sb, -err, + "couldn't read orphan inode %lu (err %d)", + ino, err); + brelse(bitmap_bh); + return inode; + } + + /* + * If the orphans has i_nlinks > 0 then it should be able to + * be truncated, otherwise it won't be removed from the orphan + * list during processing and an infinite loop will result. + * Similarly, it must not be a bad inode. 
+ */ + if ((inode->i_nlink && !ext4_can_truncate(inode)) || + is_bad_inode(inode)) + goto bad_orphan; + + if (NEXT_ORPHAN(inode) > max_ino) + goto bad_orphan; + brelse(bitmap_bh); + return inode; + +bad_orphan: + ext4_error(sb, "bad orphan inode %lu", ino); + if (bitmap_bh) + printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n", + bit, (unsigned long long)bitmap_bh->b_blocknr, + ext4_test_bit(bit, bitmap_bh->b_data)); + if (inode) { + printk(KERN_ERR "is_bad_inode(inode)=%d\n", + is_bad_inode(inode)); + printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n", + NEXT_ORPHAN(inode)); + printk(KERN_ERR "max_ino=%lu\n", max_ino); + printk(KERN_ERR "i_nlink=%u\n", inode->i_nlink); + /* Avoid freeing blocks if we got a bad deleted inode */ + if (inode->i_nlink == 0) + inode->i_blocks = 0; + iput(inode); + } + brelse(bitmap_bh); + return ERR_PTR(err); +} + +unsigned long ext4_count_free_inodes(struct super_block *sb) +{ + unsigned long desc_count; + struct ext4_group_desc *gdp; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); +#ifdef EXT4FS_DEBUG + struct ext4_super_block *es; + unsigned long bitmap_count, x; + struct buffer_head *bitmap_bh = NULL; + + es = EXT4_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += ext4_free_inodes_count(sb, gdp); + brelse(bitmap_bh); + bitmap_bh = ext4_read_inode_bitmap(sb, i); + if (IS_ERR(bitmap_bh)) { + bitmap_bh = NULL; + continue; + } + + x = ext4_count_free(bitmap_bh->b_data, + EXT4_INODES_PER_GROUP(sb) / 8); + printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", + (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk(KERN_DEBUG "ext4_count_free_inodes: " + "stored = %u, computed = %lu, %lu\n", + le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); + return desc_count; +#else + desc_count = 0; + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += ext4_free_inodes_count(sb, gdp); + cond_resched(); + } + return desc_count; +#endif +} + +/* Called at mount-time, super-block is locked */ +unsigned long ext4_count_dirs(struct super_block * sb) +{ + unsigned long count = 0; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + + for (i = 0; i < ngroups; i++) { + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + count += ext4_used_dirs_count(sb, gdp); + } + return count; +} + +/* + * Zeroes not yet zeroed inode table - just write zeroes through the whole + * inode table. Must be called without any spinlock held. The only place + * where it is called from on active part of filesystem is ext4lazyinit + * thread, so we do not need any special locks, however we have to prevent + * inode allocation from the current group, so we take alloc_sem lock, to + * block ext4_new_inode() until we are finished. + */ +int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, + int barrier) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + struct buffer_head *group_desc_bh; + handle_t *handle; + ext4_fsblk_t blk; + int num, ret = 0, used_blks = 0; + unsigned long used_inos = 0; + + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); + if (!gdp || !grp) + goto out; + + /* + * We do not need to lock this, because we are the only one + * handling this flag. 
+ */ + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) + goto out; + + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + down_write(&grp->alloc_sem); + /* + * If inode bitmap was already initialized there may be some + * used inodes so we need to skip blocks with used inodes in + * inode table. + */ + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) { + used_inos = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + used_blks = DIV_ROUND_UP(used_inos, sbi->s_inodes_per_block); + + /* Bogus inode unused count? */ + if (used_blks < 0 || used_blks > sbi->s_itb_per_group) { + ext4_error(sb, "Something is wrong with group %u: " + "used itable blocks: %d; " + "itable unused count: %u", + group, used_blks, + ext4_itable_unused_count(sb, gdp)); + ret = 1; + goto err_out; + } + + used_inos += group * EXT4_INODES_PER_GROUP(sb); + /* + * Are there some uninitialized inodes in the inode table + * before the first normal inode? + */ + if ((used_blks != sbi->s_itb_per_group) && + (used_inos < EXT4_FIRST_INO(sb))) { + ext4_error(sb, "Something is wrong with group %u: " + "itable unused count: %u; " + "itables initialized count: %ld", + group, ext4_itable_unused_count(sb, gdp), + used_inos); + ret = 1; + goto err_out; + } + } + + blk = ext4_inode_table(sb, gdp) + used_blks; + num = sbi->s_itb_per_group - used_blks; + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + ret = ext4_journal_get_write_access(handle, sb, group_desc_bh, + EXT4_JTR_NONE); + if (ret) + goto err_out; + + /* + * Skip zeroout if the inode table is full. But we set the ZEROED + * flag anyway, because obviously, when it is full it does not need + * further zeroing. + */ + if (unlikely(num == 0)) + goto skip_zeroout; + + ext4_debug("going to zero out inode table in group %d\n", + group); + ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS); + if (ret < 0) + goto err_out; + if (barrier) + blkdev_issue_flush(sb->s_bdev); + +skip_zeroout: + ext4_lock_group(sb, group); + gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); + ext4_group_desc_csum_set(sb, group, gdp); + ext4_unlock_group(sb, group); + + BUFFER_TRACE(group_desc_bh, + "call ext4_handle_dirty_metadata"); + ret = ext4_handle_dirty_metadata(handle, NULL, + group_desc_bh); + +err_out: + up_write(&grp->alloc_sem); + ext4_journal_stop(handle); +out: + return ret; +} -- 2.43.0
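(A numeric check on the itable-skip logic in ext4_init_inode_table()
above, as a standalone sketch with hypothetical sample values: 8192
inodes per group, 256-byte inodes, and 4 KiB blocks give 16 inodes per
block; if bg_itable_unused reports 8000, only the first
DIV_ROUND_UP(8192 - 8000, 16) = 12 inode table blocks can hold live
inodes, so zeroing starts after them.)

	#include <stdio.h>

	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

	int main(void)
	{
		unsigned int inodes_per_group = 8192, itable_unused = 8000;
		unsigned int inode_size = 256, block_size = 4096;
		unsigned int inodes_per_block = block_size / inode_size;
		unsigned int used_inos = inodes_per_group - itable_unused;
		unsigned int used_blks = DIV_ROUND_UP(used_inos,
						      inodes_per_block);

		/* prints: zero inode table starting 12 blocks in */
		printf("zero inode table starting %u blocks in\n",
		       used_blks);
		return 0;
	}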
From: Simon Glass <simon.glass@canonical.com> Copy indirect.c and inline.c from Linux v6.18 fs/ext4 directory. - indirect: indirect block mapping for ext2/ext3 compatibility - inline: inline data support for small files stored in inode Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> --- fs/ext4l/indirect.c | 1474 ++++++++++++++++++++++++++++++++ fs/ext4l/inline.c | 1982 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 3456 insertions(+) create mode 100644 fs/ext4l/indirect.c create mode 100644 fs/ext4l/inline.c diff --git a/fs/ext4l/indirect.c b/fs/ext4l/indirect.c new file mode 100644 index 00000000000..da76353b3a5 --- /dev/null +++ b/fs/ext4l/indirect.c @@ -0,0 +1,1474 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/indirect.c + * + * from + * + * linux/fs/ext4/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@redhat.com), 1993, 1998 + */ + +#include "ext4_jbd2.h" +#include "truncate.h" +#include <linux/dax.h> +#include <linux/uio.h> + +#include <trace/events/ext4.h> + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +/** + * ext4_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * + * To store the locations of file's data ext4 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. 
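 *
 * (A worked example of the decomposition implemented below, assuming
 * 4 KiB blocks, i.e. 12 direct slots and 1024 pointers per indirect
 * block: logical block 5000 sheds the 12 direct blocks (4988 left),
 * then the 1024 single-indirect blocks (3964 left), and resolves in
 * the double-indirect tree as offsets { EXT4_DIND_BLOCK,
 * 3964 >> 10 = 3, 3964 & 1023 = 892 }, depth 3. The same walk as a
 * self-contained sketch, with fixed constants standing in for the
 * superblock helpers and the boundary bookkeeping omitted:)

	/* Simplified block-to-path walk for 4 KiB blocks. */
	static int block_to_path(unsigned long b, unsigned long off[4])
	{
		const unsigned long direct = 12, ptrs = 1024;
		int n = 0;

		if (b < direct) {
			off[n++] = b;
		} else if ((b -= direct) < ptrs) {
			off[n++] = 12;		/* EXT4_IND_BLOCK */
			off[n++] = b;
		} else if ((b -= ptrs) < ptrs * ptrs) {
			off[n++] = 13;		/* EXT4_DIND_BLOCK */
			off[n++] = b >> 10;
			off[n++] = b & (ptrs - 1);
		} else if ((b -= ptrs * ptrs) < ptrs * ptrs * ptrs) {
			off[n++] = 14;		/* EXT4_TIND_BLOCK */
			off[n++] = b >> 20;
			off[n++] = (b >> 10) & (ptrs - 1);
			off[n++] = b & (ptrs - 1);
		}
		return n;	/* path depth, 0 if out of range */
	}

 *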
+ */ + +static int ext4_block_to_path(struct inode *inode, + ext4_lblk_t i_block, + ext4_lblk_t offsets[4], int *boundary) +{ + int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT4_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ((i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT4_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT4_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT4_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else { + ext4_warning(inode->i_sb, "block %lu > max in inode %lu", + i_block + direct_blocks + + indirect_blocks + double_blocks, inode->i_ino); + } + if (boundary) + *boundary = final - 1 - (i_block & (ptrs - 1)); + return n; +} + +/** + * ext4_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples <key, p, bh> and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). 
+ * + * Need to be called with + * down_read(&EXT4_I(inode)->i_data_sem) + */ +static Indirect *ext4_get_branch(struct inode *inode, int depth, + ext4_lblk_t *offsets, + Indirect chain[4], int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + unsigned int key; + int ret = -EIO; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + key = le32_to_cpu(p->key); + if (key > ext4_blocks_count(EXT4_SB(sb)->s_es)) { + /* the block was out of range */ + ret = -EFSCORRUPTED; + goto failure; + } + bh = sb_getblk(sb, key); + if (unlikely(!bh)) { + ret = -ENOMEM; + goto failure; + } + + if (!bh_uptodate_or_lock(bh)) { + if (ext4_read_bh(bh, 0, NULL, false) < 0) { + put_bh(bh); + goto failure; + } + /* validate block references */ + if (ext4_check_indirect_blockref(inode, bh)) { + put_bh(bh); + goto failure; + } + } + + add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); + /* Reader: end */ + if (!p->key) + goto no_block; + } + return NULL; + +failure: + *err = ret; +no_block: + return p; +} + +/** + * ext4_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the preferred place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same + * cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ +static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; + __le32 *p; + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) { + if (*p) + return le32_to_cpu(*p); + } + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be referred to from the inode itself? OK, just put it + * into the same cylinder group then. + */ + return ext4_inode_to_goal_block(inode); +} + +/** + * ext4_find_goal - find a preferred place for allocation. + * @inode: owner + * @block: block we want + * @partial: pointer to the last triple within a chain + * + * Normally this function find the preferred place for block allocation, + * returns it. + * Because this is only used for non-extent files, we limit the block nr + * to 32 bits. + */ +static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, + Indirect *partial) +{ + ext4_fsblk_t goal; + + /* + * XXX need to get goal block from mballoc's data structures + */ + + goal = ext4_find_near(inode, partial); + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + return goal; +} + +/** + * ext4_blks_to_allocate - Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. 
+ * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. + * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. + */ +static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, + int blocks_to_boundary) +{ + unsigned int count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now we don't handle cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary && + le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext4_alloc_branch() - allocate and set up a chain of blocks + * @handle: handle for this transaction + * @ar: structure describing the allocation request + * @indirect_blks: number of allocated indirect blocks + * @offsets: offsets (in the blocks) to store the pointers to next. + * @branch: place to store the chain in. + * + * This function allocates blocks, zeroes out all but the last one, + * links them into chain and (if we are synchronous) writes them to disk. + * In other words, it prepares a branch that can be spliced onto the + * inode. It stores the information about that chain in the branch[], in + * the same format as ext4_get_branch() would do. We are calling it after + * we had read the existing part of chain and partial points to the last + * triple of that (one with zero ->key). Upon the exit we have the same + * picture as after the successful ext4_get_block(), except that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. + */ +static int ext4_alloc_branch(handle_t *handle, + struct ext4_allocation_request *ar, + int indirect_blks, ext4_lblk_t *offsets, + Indirect *branch) +{ + struct buffer_head * bh; + ext4_fsblk_t b, new_blocks[4]; + __le32 *p; + int i, j, err, len = 1; + + for (i = 0; i <= indirect_blks; i++) { + if (i == indirect_blks) { + new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err); + } else { + ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle, + ar->inode, ar->goal, + ar->flags & EXT4_MB_DELALLOC_RESERVED, + NULL, &err); + /* Simplify error cleanup... 
*/ + branch[i+1].bh = NULL; + } + if (err) { + i--; + goto failed; + } + branch[i].key = cpu_to_le32(new_blocks[i]); + if (i == 0) + continue; + + bh = branch[i].bh = sb_getblk(ar->inode->i_sb, new_blocks[i-1]); + if (unlikely(!bh)) { + err = -ENOMEM; + goto failed; + } + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext4_journal_get_create_access(handle, ar->inode->i_sb, + bh, EXT4_JTR_NONE); + if (err) { + unlock_buffer(bh); + goto failed; + } + + memset(bh->b_data, 0, bh->b_size); + p = branch[i].p = (__le32 *) bh->b_data + offsets[i]; + b = new_blocks[i]; + + if (i == indirect_blks) + len = ar->len; + for (j = 0; j < len; j++) + *p++ = cpu_to_le32(b++); + + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, ar->inode, bh); + if (err) + goto failed; + } + return 0; +failed: + if (i == indirect_blks) { + /* Free data blocks */ + ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i], + ar->len, 0); + i--; + } + for (; i >= 0; i--) { + /* + * We want to ext4_forget() only freshly allocated indirect + * blocks. Buffer for new_blocks[i] is at branch[i+1].bh + * (buffer at branch[0].bh is indirect block / inode already + * existing before ext4_alloc_branch() was called). Also + * because blocks are freshly allocated, we don't need to + * revoke them which is why we don't set + * EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, ar->inode, branch[i+1].bh, + new_blocks[i], 1, + branch[i+1].bh ? EXT4_FREE_BLOCKS_FORGET : 0); + } + return err; +} + +/** + * ext4_splice_branch() - splice the allocated branch onto inode. + * @handle: handle for this transaction + * @ar: structure describing the allocation request + * @where: location of missing link + * @num: number of indirect blocks we are adding + * + * This function fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. + */ +static int ext4_splice_branch(handle_t *handle, + struct ext4_allocation_request *ar, + Indirect *where, int num) +{ + int i; + int err = 0; + ext4_fsblk_t current_block; + + /* + * If we're splicing into a [td]indirect block (as opposed to the + * inode) then we need to get write access to the [td]indirect block + * before the splice. + */ + if (where->bh) { + BUFFER_TRACE(where->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, ar->inode->i_sb, + where->bh, EXT4_JTR_NONE); + if (err) + goto err_out; + } + /* That's it */ + + *where->p = where->key; + + /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && ar->len > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < ar->len; i++) + *(where->p + i) = cpu_to_le32(current_block++); + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + /* had we spliced it onto indirect block? */ + if (where->bh) { + /* + * If we spliced it onto an indirect block, we haven't + * altered the inode. Note however that if it is being spliced + * onto an indirect block at the very end of the file (the + * file is growing) then we *will* alter the inode to reflect + * the new i_size. But that is not done here - it is done in + * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. 
+ */ + ext4_debug("splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + */ + err = ext4_mark_inode_dirty(handle, ar->inode); + if (unlikely(err)) + goto err_out; + ext4_debug("splicing direct\n"); + } + return err; + +err_out: + for (i = 1; i <= num; i++) { + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, ar->inode, where[i].bh, 0, 1, + EXT4_FREE_BLOCKS_FORGET); + } + ext4_free_blocks(handle, ar->inode, NULL, le32_to_cpu(where[num].key), + ar->len, 0); + + return err; +} + +/* + * The ext4_ind_map_blocks() function handles non-extents inodes + * (i.e., using the traditional indirect/double-indirect i_blocks + * scheme) for ext4_map_blocks(). + * + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * `handle' can be NULL if create == 0. + * + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. + * + * The ext4_ind_get_blocks() function should be called with + * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem + * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system + * blocks. + */ +int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + int flags) +{ + struct ext4_allocation_request ar; + int err = -EIO; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; + u64 count = 0; + ext4_fsblk_t first_block = 0; + + trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); + ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); + ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); + depth = ext4_block_to_path(inode, map->m_lblk, offsets, + &blocks_to_boundary); + + if (depth == 0) + goto out; + + partial = ext4_get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); + count++; + /*map more blocks*/ + while (count < map->m_len && count <= blocks_to_boundary) { + ext4_fsblk_t blk; + + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) + count++; + else + break; + } + goto got_it; + } + + /* Next simple case - plain lookup failed */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + unsigned epb = inode->i_sb->s_blocksize / sizeof(u32); + int i; + + /* + * Count number blocks in a subtree under 'partial'. 
At each + * level we count number of complete empty subtrees beyond + * current offset and then descend into the subtree only + * partially beyond current offset. + */ + count = 0; + for (i = partial - chain + 1; i < depth; i++) + count = count * epb + (epb - offsets[i] - 1); + count++; + /* Fill in size of a hole we found */ + map->m_pblk = 0; + map->m_len = umin(map->m_len, count); + goto cleanup; + } + + /* Failed read of indirect block */ + if (err == -EIO) + goto cleanup; + + /* + * Okay, we need to do block allocation. + */ + if (ext4_has_feature_bigalloc(inode->i_sb)) { + EXT4_ERROR_INODE(inode, "Can't allocate blocks for " + "non-extent mapped inodes with bigalloc"); + err = -EFSCORRUPTED; + goto out; + } + + /* Set up for the direct block allocation */ + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.logical = map->m_lblk; + if (S_ISREG(inode->i_mode)) + ar.flags = EXT4_MB_HINT_DATA; + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ar.flags |= EXT4_MB_DELALLOC_RESERVED; + if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) + ar.flags |= EXT4_MB_USE_RESERVED; + + ar.goal = ext4_find_goal(inode, map->m_lblk, partial); + + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; + + /* + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ + ar.len = ext4_blks_to_allocate(partial, indirect_blks, + map->m_len, blocks_to_boundary); + + /* + * Block out ext4_truncate while we alter the tree + */ + err = ext4_alloc_branch(handle, &ar, indirect_blks, + offsets + (partial - chain), partial); + + /* + * The ext4_splice_branch call will free and forget any buffers + * on the new chain if there is a failure, but that risks using + * up transaction credits, especially for bitmaps where the + * credits cannot be returned. Can we handle this somehow? We + * may need to return -EAGAIN upwards in the worst case. --sct + */ + if (!err) + err = ext4_splice_branch(handle, &ar, partial, indirect_blks); + if (err) + goto cleanup; + + map->m_flags |= EXT4_MAP_NEW; + + ext4_update_inode_fsync_trans(handle, inode, 1); + count = ar.len; + +got_it: + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = le32_to_cpu(chain[depth-1].key); + map->m_len = count; + if (count > blocks_to_boundary) + map->m_flags |= EXT4_MAP_BOUNDARY; + err = count; + /* Clean up and exit */ + partial = chain + depth - 1; /* the whole chain */ +cleanup: + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +out: + trace_ext4_ind_map_blocks_exit(inode, flags, map, err); + return err; +} + +/* + * Calculate number of indirect blocks touched by mapping @nrblocks logically + * contiguous blocks + */ +int ext4_ind_trans_blocks(struct inode *inode, int nrblocks) +{ + /* + * With N contiguous data blocks, we need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, + * 2 dindirect blocks, and 1 tindirect block + */ + return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; +} + +static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int *dropped) +{ + int err; + + if (bh) { + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (unlikely(err)) + return err; + } + err = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err)) + return err; + /* + * Drop i_data_sem to avoid deadlock with ext4_map_blocks. 
At this + * moment, get_block can be called only for blocks inside i_size since + * page cache has been already dropped and writes are blocked by + * i_rwsem. So we can safely drop the i_data_sem here. + */ + BUG_ON(EXT4_JOURNAL(inode) == NULL); + ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + *dropped = 1; + return 0; +} + +/* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a convenient checkpoint to make + * sure we don't overflow the journal. + * + * Try to extend this transaction for the purposes of truncation. If + * extend fails, we restart transaction. + */ +static int ext4_ind_truncate_ensure_credits(handle_t *handle, + struct inode *inode, + struct buffer_head *bh, + int revoke_creds) +{ + int ret; + int dropped = 0; + + ret = ext4_journal_ensure_credits_fn(handle, EXT4_RESERVE_TRANS_BLOCKS, + ext4_blocks_for_truncate(inode), revoke_creds, + ext4_ind_trunc_restart_fn(handle, inode, bh, &dropped)); + if (dropped) + down_write(&EXT4_I(inode)->i_data_sem); + if (ret <= 0) + return ret; + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); + if (unlikely(ret)) + return ret; + } + return 0; +} + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(__le32 *p, __le32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext4_find_shared - find the indirect blocks for partial truncation. + * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext4_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to the (detached) top of branch + * + * This is a helper function used by ext4_truncate(). + * + * When we do truncate() we may have to clean the ends of several + * indirect blocks but leave the blocks themselves alive. Block is + * partially truncated if some data below the new i_size is referred + * from it (and it is on the path to the first completely truncated + * data block, indeed). We have to free the top of that path along + * with everything to the right of the path. Since no allocation + * past the truncation point is possible until ext4_truncate() + * finishes, we may safely do the latter, but top of branch may + * require special attention - pageout below the truncation point + * might try to populate it. + * + * We atomically detach the top of branch from the tree, store the + * block number of its root in *@top, pointers to buffer_heads of + * partially truncated blocks - in @chain[].bh and pointers to + * their last elements that should not be removed - in + * @chain[].p. Return value is the pointer to last filled element + * of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0]. + * (no partially truncated stuff there). 
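The extend-or-restart idiom used by ext4_ind_truncate_ensure_credits() is worth calling out. A minimal sketch of the contract, using only helpers already present in this file (the real caller also retakes write access on @bh after a restart; omitted here):

static int demo_ensure_credits(handle_t *handle, struct inode *inode)
{
	int dropped = 0;
	/* < 0: error; 0: had enough credits; > 0: handle was restarted */
	int ret = ext4_journal_ensure_credits_fn(handle,
			EXT4_RESERVE_TRANS_BLOCKS,
			ext4_blocks_for_truncate(inode),
			ext4_free_metadata_revoke_credits(inode->i_sb, 1),
			ext4_ind_trunc_restart_fn(handle, inode, NULL,
						  &dropped));

	if (dropped)
		down_write(&EXT4_I(inode)->i_data_sem);
	return ret < 0 ? ret : 0;
}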
*/ + +static Indirect *ext4_find_shared(struct inode *inode, int depth, + ext4_lblk_t offsets[4], Indirect chain[4], + __le32 *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + /* Make k index the deepest non-null offset + 1 */ + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext4_get_branch(inode, k, offsets, chain, &err); + /* Writer: pointers */ + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + if (!partial->key && *partial->p) + /* Writer: end */ + goto no_top; + for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + /* Nope, don't do this in ext4. Must leave the tree intact */ +#if 0 + *p->p = 0; +#endif + } + /* Writer: end */ + + while (partial > p) { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +/* + * Zero a number of block pointers in either an inode or an indirect block. + * If we restart the transaction we must again get write access to the + * indirect block for further modification. + * + * We release `count' blocks on disk, but (last - first) may be greater + * than `count' because there can be holes in there. + * + * Return 0 on success, 1 on invalid block range + * and < 0 on fatal error. + */ +static int ext4_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, + ext4_fsblk_t block_to_free, + unsigned long count, __le32 *first, + __le32 *last) +{ + __le32 *p; + int flags = EXT4_FREE_BLOCKS_VALIDATED; + int err; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) + flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA; + else if (ext4_should_journal_data(inode)) + flags |= EXT4_FREE_BLOCKS_FORGET; + + if (!ext4_inode_block_valid(inode, block_to_free, count)) { + EXT4_ERROR_INODE(inode, "attempt to clear invalid " + "blocks %llu len %lu", + (unsigned long long) block_to_free, count); + return 1; + } + + err = ext4_ind_truncate_ensure_credits(handle, inode, bh, + ext4_free_data_revoke_credits(inode, count)); + if (err < 0) + goto out_err; + + for (p = first; p < last; p++) + *p = 0; + + ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); + return 0; +out_err: + ext4_std_error(inode->i_sb, err); + return err; +} + +/** + * ext4_free_data - free a list of data blocks + * @handle: handle for this transaction + * @inode: inode we are dealing with + * @this_bh: indirect buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: points immediately past the end of array + * + * We are freeing all blocks referred from that array (numbers are stored as + * little-endian 32-bit) and updating @inode->i_blocks appropriately. + * + * We accumulate contiguous runs of blocks to free. Conveniently, if these + * blocks are contiguous then releasing them at one time will only affect one + * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't + * actually use a lot of journal space. 
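For readers new to the indirect scheme, here is a stand-alone sketch of the logical-block decomposition behind the offsets[4]/depth pairs used throughout this file. It assumes 4 KiB blocks (1024 u32 pointers per block); the real ext4_block_to_path() also reports block boundaries and validates the range.

static int demo_block_to_path(ext4_lblk_t block, ext4_lblk_t offsets[4])
{
	const ext4_lblk_t ptrs = 1024;	/* EXT4_ADDR_PER_BLOCK at 4 KiB */
	int n = 0;

	if (block < EXT4_NDIR_BLOCKS) {		/* 12 direct slots */
		offsets[n++] = block;
	} else if ((block -= EXT4_NDIR_BLOCKS) < ptrs) {
		offsets[n++] = EXT4_IND_BLOCK;
		offsets[n++] = block;
	} else if ((block -= ptrs) < ptrs * ptrs) {
		offsets[n++] = EXT4_DIND_BLOCK;
		offsets[n++] = block / ptrs;
		offsets[n++] = block % ptrs;
	} else {				/* triple indirect */
		block -= ptrs * ptrs;
		offsets[n++] = EXT4_TIND_BLOCK;
		offsets[n++] = block / (ptrs * ptrs);
		offsets[n++] = (block / ptrs) % ptrs;
		offsets[n++] = block % ptrs;
	}
	return n;				/* depth: 1..4 */
}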
+ * + * @this_bh will be %NULL if @first and @last point into the inode's direct + * block pointers. + */ +static void ext4_free_data(handle_t *handle, struct inode *inode, + struct buffer_head *this_bh, + __le32 *first, __le32 *last) +{ + ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ + unsigned long count = 0; /* Number of blocks in the run */ + __le32 *block_to_free_p = NULL; /* Pointer into inode/ind + corresponding to + block_to_free */ + ext4_fsblk_t nr; /* Current block # */ + __le32 *p; /* Pointer into inode/ind + for current block */ + int err = 0; + + if (this_bh) { /* For indirect block */ + BUFFER_TRACE(this_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode->i_sb, + this_bh, EXT4_JTR_NONE); + /* Important: if we can't update the indirect pointers + * to the blocks, we can't free them. */ + if (err) + return; + } + + for (p = first; p < last; p++) { + nr = le32_to_cpu(*p); + if (nr) { + /* accumulate blocks to free if they're contiguous */ + if (count == 0) { + block_to_free = nr; + block_to_free_p = p; + count = 1; + } else if (nr == block_to_free + count) { + count++; + } else { + err = ext4_clear_blocks(handle, inode, this_bh, + block_to_free, count, + block_to_free_p, p); + if (err) + break; + block_to_free = nr; + block_to_free_p = p; + count = 1; + } + } + } + + if (!err && count > 0) + err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, + count, block_to_free_p, p); + if (err < 0) + /* fatal error */ + return; + + if (this_bh) { + BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. + */ + if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) + ext4_handle_dirty_metadata(handle, inode, this_bh); + else + EXT4_ERROR_INODE(inode, + "circular indirect block detected at " + "block %llu", + (unsigned long long) this_bh->b_blocknr); + } +} + +/** + * ext4_free_branches - free an array of branches + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @parent_bh: the buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks referred from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static void ext4_free_branches(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, + __le32 *first, __le32 *last, int depth) +{ + ext4_fsblk_t nr; + __le32 *p; + + if (ext4_handle_is_aborted(handle)) + return; + + if (depth--) { + struct buffer_head *bh; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + p = last; + while (--p >= first) { + nr = le32_to_cpu(*p); + if (!nr) + continue; /* A hole */ + + if (!ext4_inode_block_valid(inode, nr, 1)) { + EXT4_ERROR_INODE(inode, + "invalid indirect mapped " + "block %lu (level %d)", + (unsigned long) nr, depth); + break; + } + + /* Go read the buffer for the next level down */ + bh = ext4_sb_bread_nofail(inode->i_sb, nr); + + /* + * A read failure? Report error and clear slot + * (should be rare). + */ + if (IS_ERR(bh)) { + ext4_error_inode_block(inode, nr, -PTR_ERR(bh), + "Read failure"); + continue; + } + + /* This zaps the entire block. Bottom up. 
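The run-coalescing loop in ext4_free_data() is the key to keeping journal usage low. A minimal model of that loop, purely illustrative (free_run() stands in for ext4_free_blocks(); the hypothetical types are simplified):

static void demo_coalesce(const u32 *p, const u32 *end,
			  void (*free_run)(u32 start, unsigned long count))
{
	u32 start = 0;
	unsigned long count = 0;

	for (; p < end; p++) {
		if (!*p)
			continue;		/* a hole */
		if (count && *p == start + count) {
			count++;		/* extends the current run */
		} else {
			if (count)
				free_run(start, count);
			start = *p;
			count = 1;
		}
	}
	if (count)
		free_run(start, count);	/* flush the final run */
}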
*/ + BUFFER_TRACE(bh, "free child branches"); + ext4_free_branches(handle, inode, bh, + (__le32 *) bh->b_data, + (__le32 *) bh->b_data + addr_per_block, + depth); + brelse(bh); + + /* + * Everything below this pointer has been + * released. Now let this top-of-subtree go. + * + * We want the freeing of this indirect block to be + * atomic in the journal with the updating of the + * bitmap block which owns it. So make some room in + * the journal. + * + * We zero the parent pointer *after* freeing its + * pointee in the bitmaps, so if extend_transaction() + * for some reason fails to put the bitmap changes and + * the release into the same transaction, recovery + * will merely complain about releasing a free block, + * rather than leaking blocks. + */ + if (ext4_handle_is_aborted(handle)) + return; + if (ext4_ind_truncate_ensure_credits(handle, inode, + NULL, + ext4_free_metadata_revoke_credits( + inode->i_sb, 1)) < 0) + return; + + /* + * The forget flag here is critical because if + * we are journaling (and not doing data + * journaling), we have to make sure a revoke + * record is written to prevent the journal + * replay from overwriting the (former) + * indirect block if it gets reallocated as a + * data block. This must happen in the same + * transaction where the data blocks are + * actually freed. + */ + ext4_free_blocks(handle, inode, NULL, nr, 1, + EXT4_FREE_BLOCKS_METADATA| + EXT4_FREE_BLOCKS_FORGET); + + if (parent_bh) { + /* + * The block which we have just freed is + * pointed to by an indirect block: journal it + */ + BUFFER_TRACE(parent_bh, "get_write_access"); + if (!ext4_journal_get_write_access(handle, + inode->i_sb, parent_bh, + EXT4_JTR_NONE)) { + *p = 0; + BUFFER_TRACE(parent_bh, + "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, + inode, + parent_bh); + } + } + } + } else { + /* We have reached the bottom of the tree. */ + BUFFER_TRACE(parent_bh, "free data blocks"); + ext4_free_data(handle, inode, parent_bh, first, last); + } +} + +void ext4_ind_truncate(handle_t *handle, struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *i_data = ei->i_data; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + __le32 nr = 0; + int n = 0; + ext4_lblk_t last_block, max_block; + unsigned blocksize = inode->i_sb->s_blocksize; + + last_block = (inode->i_size + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + + if (last_block != max_block) { + n = ext4_block_to_path(inode, last_block, offsets, NULL); + if (n == 0) + return; + } + + ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); + + /* + * The orphan list entry will now protect us from any crash which + * occurs before the truncate completes, so it is now safe to propagate + * the new, shorter inode size (held for now in i_size) into the + * on-disk inode. We do this via i_disksize, which is the value which + * ext4 *really* writes onto the disk inode. + */ + ei->i_disksize = inode->i_size; + + if (last_block == max_block) { + /* + * It is unnecessary to free any data blocks if last_block is + * equal to the indirect block limit. 
+ */ + return; + } else if (n == 1) { /* direct blocks */ + ext4_free_data(handle, inode, NULL, i_data+offsets[0], + i_data + EXT4_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext4_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (not detached) */ + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + /* + * We mark the inode dirty prior to restart, + * and prior to stop. No need for it here. + */ + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext4_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext4_free_branches(handle, inode, partial->bh, partial->p + 1, + (__le32*)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT4_IND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT4_IND_BLOCK] = 0; + } + fallthrough; + case EXT4_IND_BLOCK: + nr = i_data[EXT4_DIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT4_DIND_BLOCK] = 0; + } + fallthrough; + case EXT4_DIND_BLOCK: + nr = i_data[EXT4_TIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT4_TIND_BLOCK] = 0; + } + fallthrough; + case EXT4_TIND_BLOCK: + ; + } +} + +/** + * ext4_ind_remove_space - remove space from the range + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @start: First block to remove + * @end: One block after the last block to remove (exclusive) + * + * Free the blocks in the defined range (end is exclusive endpoint of + * range). This is used by ext4_punch_hole(). + */ +int ext4_ind_remove_space(handle_t *handle, struct inode *inode, + ext4_lblk_t start, ext4_lblk_t end) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *i_data = ei->i_data; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + ext4_lblk_t offsets[4], offsets2[4]; + Indirect chain[4], chain2[4]; + Indirect *partial, *partial2; + Indirect *p = NULL, *p2 = NULL; + ext4_lblk_t max_block; + __le32 nr = 0, nr2 = 0; + int n = 0, n2 = 0; + unsigned blocksize = inode->i_sb->s_blocksize; + + max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + if (end >= max_block) + end = max_block; + if ((start >= end) || (start > max_block)) + return 0; + + n = ext4_block_to_path(inode, start, offsets, NULL); + n2 = ext4_block_to_path(inode, end, offsets2, NULL); + + BUG_ON(n > n2); + + if ((n == 1) && (n == n2)) { + /* We're punching only within direct block range */ + ext4_free_data(handle, inode, NULL, i_data + offsets[0], + i_data + offsets2[0]); + return 0; + } else if (n2 > n) { + /* + * Start and end are on a different levels so we're going to + * free partial block at start, and partial block at end of + * the range. If there are some levels in between then + * do_indirects label will take care of that. + */ + + if (n == 1) { + /* + * Start is at the direct block level, free + * everything to the end of the level. 
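To make the boundary arithmetic in ext4_ind_truncate() concrete: with 4 KiB blocks (EXT4_BLOCK_SIZE_BITS == 12), a truncate to i_size = 10000 gives

	last_block = (10000 + 4096 - 1) >> 12 = 3

i.e. logical blocks 0..2 (bytes 0..12287) continue to back the file and freeing starts at block 3; max_block is derived the same way from s_bitmap_maxbytes.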
+ */ + ext4_free_data(handle, inode, NULL, i_data + offsets[0], + i_data + EXT4_NDIR_BLOCKS); + goto end_range; + } + + + partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext4_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + + /* + * Clear the ends of indirect blocks on the shared branch + * at the start of the range + */ + while (partial > chain) { + ext4_free_branches(handle, inode, partial->bh, + partial->p + 1, + (__le32 *)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + partial--; + } + +end_range: + partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); + if (nr2) { + if (partial2 == chain2) { + /* + * Remember, end is exclusive so here we're at + * the start of the next level we're not going + * to free. Everything was covered by the start + * of the range. + */ + goto do_indirects; + } + } else { + /* + * ext4_find_shared returns Indirect structure which + * points to the last element which should not be + * removed by truncate. But this is end of the range + * in punch_hole so we need to point to the next element + */ + partial2->p++; + } + + /* + * Clear the ends of indirect blocks on the shared branch + * at the end of the range + */ + while (partial2 > chain2) { + ext4_free_branches(handle, inode, partial2->bh, + (__le32 *)partial2->bh->b_data, + partial2->p, + (chain2+n2-1) - partial2); + partial2--; + } + goto do_indirects; + } + + /* Punch happened within the same level (n == n2) */ + partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); + partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); + + /* Free top, but only if partial2 isn't its subtree. */ + if (nr) { + int level = min(partial - chain, partial2 - chain2); + int i; + int subtree = 1; + + for (i = 0; i <= level; i++) { + if (offsets[i] != offsets2[i]) { + subtree = 0; + break; + } + } + + if (!subtree) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, + (chain+n-1) - partial); + *partial->p = 0; + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext4_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, + (chain+n-1) - partial); + } + } + } + + if (!nr2) { + /* + * ext4_find_shared returns Indirect structure which + * points to the last element which should not be + * removed by truncate. But this is end of the range + * in punch_hole so we need to point to the next element + */ + partial2->p++; + } + + while (partial > chain || partial2 > chain2) { + int depth = (chain+n-1) - partial; + int depth2 = (chain2+n2-1) - partial2; + + if (partial > chain && partial2 > chain2 && + partial->bh->b_blocknr == partial2->bh->b_blocknr) { + /* + * We've converged on the same block. Clear the range, + * then we're done. + */ + ext4_free_branches(handle, inode, partial->bh, + partial->p + 1, + partial2->p, + (chain+n-1) - partial); + goto cleanup; + } + + /* + * The start and end partial branches may not be at the same + * level even though the punch happened within one level. 
So, we + * give them a chance to arrive at the same level, then walk + * them in step with each other until we converge on the same + * block. + */ + if (partial > chain && depth <= depth2) { + ext4_free_branches(handle, inode, partial->bh, + partial->p + 1, + (__le32 *)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + partial--; + } + if (partial2 > chain2 && depth2 <= depth) { + ext4_free_branches(handle, inode, partial2->bh, + (__le32 *)partial2->bh->b_data, + partial2->p, + (chain2+n2-1) - partial2); + partial2--; + } + } + +cleanup: + while (p && p > chain) { + BUFFER_TRACE(p->bh, "call brelse"); + brelse(p->bh); + p--; + } + while (p2 && p2 > chain2) { + BUFFER_TRACE(p2->bh, "call brelse"); + brelse(p2->bh); + p2--; + } + return 0; + +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + if (++n >= n2) + break; + nr = i_data[EXT4_IND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT4_IND_BLOCK] = 0; + } + fallthrough; + case EXT4_IND_BLOCK: + if (++n >= n2) + break; + nr = i_data[EXT4_DIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT4_DIND_BLOCK] = 0; + } + fallthrough; + case EXT4_DIND_BLOCK: + if (++n >= n2) + break; + nr = i_data[EXT4_TIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT4_TIND_BLOCK] = 0; + } + fallthrough; + case EXT4_TIND_BLOCK: + ; + } + goto cleanup; +} diff --git a/fs/ext4l/inline.c b/fs/ext4l/inline.c new file mode 100644 index 00000000000..1b094a4f386 --- /dev/null +++ b/fs/ext4l/inline.c @@ -0,0 +1,1982 @@ +// SPDX-License-Identifier: LGPL-2.1 +/* + * Copyright (c) 2012 Taobao. + * Written by Tao Ma <boyu.mt@taobao.com> + */ + +#include <linux/iomap.h> +#include <linux/fiemap.h> +#include <linux/namei.h> +#include <linux/iversion.h> +#include <linux/sched/mm.h> + +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" +#include "truncate.h" + +#define EXT4_XATTR_SYSTEM_DATA "data" +#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) +#define EXT4_INLINE_DOTDOT_OFFSET 2 +#define EXT4_INLINE_DOTDOT_SIZE 4 + + +static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, + struct inode *inode, + void **fsdata); + +static int ext4_get_inline_size(struct inode *inode) +{ + if (EXT4_I(inode)->i_inline_off) + return EXT4_I(inode)->i_inline_size; + + return 0; +} + +static int get_max_inline_xattr_value_size(struct inode *inode, + struct ext4_iloc *iloc) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry; + struct ext4_inode *raw_inode; + void *end; + int free, min_offs; + + if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) + return 0; + + min_offs = EXT4_SB(inode->i_sb)->s_inode_size - + EXT4_GOOD_OLD_INODE_SIZE - + EXT4_I(inode)->i_extra_isize - + sizeof(struct ext4_xattr_ibody_header); + + /* + * We need to subtract another sizeof(__u32) since an in-inode xattr + * needs an empty 4 bytes to indicate the gap between the xattr entry + * and the name/value pair. + */ + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) + return EXT4_XATTR_SIZE(min_offs - + EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) - + EXT4_XATTR_ROUND - sizeof(__u32)); + + raw_inode = ext4_raw_inode(iloc); + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + + /* Compute min_offs. 
+	while (!IS_LAST_ENTRY(entry)) {
+		void *next = EXT4_XATTR_NEXT(entry);
+
+		if (next >= end) {
+			EXT4_ERROR_INODE(inode,
+					 "corrupt xattr in inline inode");
+			return 0;
+		}
+		if (!entry->e_value_inum && entry->e_value_size) {
+			size_t offs = le16_to_cpu(entry->e_value_offs);
+			if (offs < min_offs)
+				min_offs = offs;
+		}
+		entry = next;
+	}
+	free = min_offs -
+		((void *)entry - (void *)IFIRST(header)) - sizeof(__u32);
+
+	if (EXT4_I(inode)->i_inline_off) {
+		entry = (struct ext4_xattr_entry *)
+			((void *)raw_inode + EXT4_I(inode)->i_inline_off);
+
+		free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
+		goto out;
+	}
+
+	free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA));
+
+	if (free > EXT4_XATTR_ROUND)
+		free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND);
+	else
+		free = 0;
+
+out:
+	return free;
+}
+
+/*
+ * Get the maximum size we can now store in an inode.
+ * If we can't find the space for a xattr entry, don't use the space
+ * of the extents since we have no space to indicate the inline data.
+ */
+int ext4_get_max_inline_size(struct inode *inode)
+{
+	int error, max_inline_size;
+	struct ext4_iloc iloc;
+
+	if (EXT4_I(inode)->i_extra_isize == 0)
+		return 0;
+
+	error = ext4_get_inode_loc(inode, &iloc);
+	if (error) {
+		ext4_error_inode_err(inode, __func__, __LINE__, 0, -error,
+				     "can't get inode location %lu",
+				     inode->i_ino);
+		return 0;
+	}
+
+	down_read(&EXT4_I(inode)->xattr_sem);
+	max_inline_size = get_max_inline_xattr_value_size(inode, &iloc);
+	up_read(&EXT4_I(inode)->xattr_sem);
+
+	brelse(iloc.bh);
+
+	if (!max_inline_size)
+		return 0;
+
+	return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE;
+}
+
+/*
+ * This function does not take xattr_sem, which is OK because it is
+ * currently only used in a code path coming from ext4_iget, before
+ * the new inode has been unlocked
+ */
+int ext4_find_inline_data_nolock(struct inode *inode)
+{
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	struct ext4_xattr_info i = {
+		.name_index = EXT4_XATTR_INDEX_SYSTEM,
+		.name = EXT4_XATTR_SYSTEM_DATA,
+	};
+	int error;
+
+	if (EXT4_I(inode)->i_extra_isize == 0)
+		return 0;
+
+	error = ext4_get_inode_loc(inode, &is.iloc);
+	if (error)
+		return error;
+
+	error = ext4_xattr_ibody_find(inode, &i, &is);
+	if (error)
+		goto out;
+
+	if (!is.s.not_found) {
+		if (is.s.here->e_value_inum) {
+			EXT4_ERROR_INODE(inode, "inline data xattr refers "
+					 "to an external xattr inode");
+			error = -EFSCORRUPTED;
+			goto out;
+		}
+		EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+					(void *)ext4_raw_inode(&is.iloc));
+		EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
+				le32_to_cpu(is.s.here->e_value_size);
+	}
+out:
+	brelse(is.iloc.bh);
+	return error;
+}
+
+static int ext4_read_inline_data(struct inode *inode, void *buffer,
+				 unsigned int len,
+				 struct ext4_iloc *iloc)
+{
+	struct ext4_xattr_entry *entry;
+	struct ext4_xattr_ibody_header *header;
+	int cp_len = 0;
+	struct ext4_inode *raw_inode;
+
+	if (!len)
+		return 0;
+
+	BUG_ON(len > EXT4_I(inode)->i_inline_size);
+
+	cp_len = min_t(unsigned int, len, EXT4_MIN_INLINE_DATA_SIZE);
+
+	raw_inode = ext4_raw_inode(iloc);
+	memcpy(buffer, (void *)(raw_inode->i_block), cp_len);
+
+	len -= cp_len;
+	buffer += cp_len;
+
+	if (!len)
+		goto out;
+
+	header = IHDR(inode, raw_inode);
+	entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+					    EXT4_I(inode)->i_inline_off);
+	len = min_t(unsigned int, len,
+		    (unsigned int)le32_to_cpu(entry->e_value_size));
+
+	memcpy(buffer,
+	       (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len);
+	cp_len += len;
+
+out:
+	return cp_len;
+}
+
+/*
+ * Write the buffer to the inline inode.
+ * If 'create' is set, we don't need to do the extra copy in the xattr
+ * value since it is already handled by ext4_xattr_ibody_set.
+ * That saves us one memcpy.
+ */
+static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
+				   void *buffer, loff_t pos, unsigned int len)
+{
+	struct ext4_xattr_entry *entry;
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_inode *raw_inode;
+	int cp_len = 0;
+
+	if (unlikely(ext4_emergency_state(inode->i_sb)))
+		return;
+
+	BUG_ON(!EXT4_I(inode)->i_inline_off);
+	BUG_ON(pos + len > EXT4_I(inode)->i_inline_size);
+
+	raw_inode = ext4_raw_inode(iloc);
+	buffer += pos;
+
+	if (pos < EXT4_MIN_INLINE_DATA_SIZE) {
+		cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ?
+			 EXT4_MIN_INLINE_DATA_SIZE - pos : len;
+		memcpy((void *)raw_inode->i_block + pos, buffer, cp_len);
+
+		len -= cp_len;
+		buffer += cp_len;
+		pos += cp_len;
+	}
+
+	if (!len)
+		return;
+
+	pos -= EXT4_MIN_INLINE_DATA_SIZE;
+	header = IHDR(inode, raw_inode);
+	entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+					    EXT4_I(inode)->i_inline_off);
+
+	memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos,
+	       buffer, len);
+}
+
+static int ext4_create_inline_data(handle_t *handle,
+				   struct inode *inode, unsigned len)
+{
+	int error;
+	void *value = NULL;
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	struct ext4_xattr_info i = {
+		.name_index = EXT4_XATTR_INDEX_SYSTEM,
+		.name = EXT4_XATTR_SYSTEM_DATA,
+	};
+
+	error = ext4_get_inode_loc(inode, &is.iloc);
+	if (error)
+		return error;
+
+	BUFFER_TRACE(is.iloc.bh, "get_write_access");
+	error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh,
+					      EXT4_JTR_NONE);
+	if (error)
+		goto out;
+
+	if (len > EXT4_MIN_INLINE_DATA_SIZE) {
+		value = EXT4_ZERO_XATTR_VALUE;
+		len -= EXT4_MIN_INLINE_DATA_SIZE;
+	} else {
+		value = "";
+		len = 0;
+	}
+
+	/* Insert the xattr entry. */
+	i.value = value;
+	i.value_len = len;
+
+	error = ext4_xattr_ibody_find(inode, &i, &is);
+	if (error)
+		goto out;
+
+	if (!is.s.not_found) {
+		EXT4_ERROR_INODE(inode, "unexpected inline data xattr");
+		error = -EFSCORRUPTED;
+		goto out;
+	}
+
+	error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+	if (error) {
+		if (error == -ENOSPC)
+			ext4_clear_inode_state(inode,
+					       EXT4_STATE_MAY_INLINE_DATA);
+		goto out;
+	}
+
+	memset((void *)ext4_raw_inode(&is.iloc)->i_block,
+	       0, EXT4_MIN_INLINE_DATA_SIZE);
+
+	EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+				      (void *)ext4_raw_inode(&is.iloc));
+	EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE;
+	ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+	ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+	get_bh(is.iloc.bh);
+	error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+
+out:
+	brelse(is.iloc.bh);
+	return error;
+}
+
+static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
+				   unsigned int len)
+{
+	int error;
+	void *value = NULL;
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	struct ext4_xattr_info i = {
+		.name_index = EXT4_XATTR_INDEX_SYSTEM,
+		.name = EXT4_XATTR_SYSTEM_DATA,
+	};
+
+	/* If the old space is ok, write the data directly.
*/ + if (len <= EXT4_I(inode)->i_inline_size) + return 0; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + if (is.s.not_found) { + EXT4_ERROR_INODE(inode, "missing inline data xattr"); + error = -EFSCORRUPTED; + goto out; + } + + len -= EXT4_MIN_INLINE_DATA_SIZE; + value = kzalloc(len, GFP_NOFS); + if (!value) { + error = -ENOMEM; + goto out; + } + + error = ext4_xattr_ibody_get(inode, i.name_index, i.name, + value, len); + if (error < 0) + goto out; + + BUFFER_TRACE(is.iloc.bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, + EXT4_JTR_NONE); + if (error) + goto out; + + /* Update the xattr entry. */ + i.value = value; + i.value_len = len; + + error = ext4_xattr_ibody_set(handle, inode, &i, &is); + if (error) + goto out; + + EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - + (void *)ext4_raw_inode(&is.iloc)); + EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + + le32_to_cpu(is.s.here->e_value_size); + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + get_bh(is.iloc.bh); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + +out: + kfree(value); + brelse(is.iloc.bh); + return error; +} + +static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, + loff_t len) +{ + int ret, size, no_expand; + struct ext4_inode_info *ei = EXT4_I(inode); + + if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) + return -ENOSPC; + + size = ext4_get_max_inline_size(inode); + if (size < len) + return -ENOSPC; + + ext4_write_lock_xattr(inode, &no_expand); + + if (ei->i_inline_off) + ret = ext4_update_inline_data(handle, inode, len); + else + ret = ext4_create_inline_data(handle, inode, len); + + ext4_write_unlock_xattr(inode, &no_expand); + return ret; +} + +static int ext4_destroy_inline_data_nolock(handle_t *handle, + struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_xattr_ibody_find is = { + .s = { .not_found = 0, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + .value = NULL, + .value_len = 0, + }; + int error; + + if (!ei->i_inline_off) + return 0; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + BUFFER_TRACE(is.iloc.bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, + EXT4_JTR_NONE); + if (error) + goto out; + + error = ext4_xattr_ibody_set(handle, inode, &i, &is); + if (error) + goto out; + + memset((void *)ext4_raw_inode(&is.iloc)->i_block, + 0, EXT4_MIN_INLINE_DATA_SIZE); + memset(ei->i_data, 0, EXT4_MIN_INLINE_DATA_SIZE); + + if (ext4_has_feature_extents(inode->i_sb)) { + if (S_ISDIR(inode->i_mode) || + S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode); + } + } + ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); + + get_bh(is.iloc.bh); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + + EXT4_I(inode)->i_inline_off = 0; + EXT4_I(inode)->i_inline_size = 0; + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); +out: + brelse(is.iloc.bh); + if (error == -ENODATA) + error = 0; + return error; +} + +static int ext4_read_inline_folio(struct inode *inode, struct folio *folio) +{ + void *kaddr; + int ret = 0; + size_t len; + struct ext4_iloc 
iloc;
+
+	BUG_ON(!folio_test_locked(folio));
+	BUG_ON(!ext4_has_inline_data(inode));
+	BUG_ON(folio->index);
+
+	if (!EXT4_I(inode)->i_inline_off) {
+		ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.",
+			     inode->i_ino);
+		goto out;
+	}
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		goto out;
+
+	len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode));
+	BUG_ON(len > PAGE_SIZE);
+	kaddr = kmap_local_folio(folio, 0);
+	ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
+	kaddr = folio_zero_tail(folio, len, kaddr + len);
+	kunmap_local(kaddr);
+	folio_mark_uptodate(folio);
+	brelse(iloc.bh);
+
+out:
+	return ret;
+}
+
+int ext4_readpage_inline(struct inode *inode, struct folio *folio)
+{
+	int ret = 0;
+
+	down_read(&EXT4_I(inode)->xattr_sem);
+	if (!ext4_has_inline_data(inode)) {
+		up_read(&EXT4_I(inode)->xattr_sem);
+		return -EAGAIN;
+	}
+
+	/*
+	 * Current inline data can only exist in the 1st page,
+	 * so for all the other pages, just set them uptodate.
+	 */
+	if (!folio->index)
+		ret = ext4_read_inline_folio(inode, folio);
+	else if (!folio_test_uptodate(folio)) {
+		folio_zero_segment(folio, 0, folio_size(folio));
+		folio_mark_uptodate(folio);
+	}
+
+	up_read(&EXT4_I(inode)->xattr_sem);
+
+	folio_unlock(folio);
+	return ret >= 0 ? 0 : ret;
+}
+
+static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
+					      struct inode *inode)
+{
+	int ret, needed_blocks, no_expand;
+	handle_t *handle = NULL;
+	int retries = 0, sem_held = 0;
+	struct folio *folio = NULL;
+	unsigned from, to;
+	struct ext4_iloc iloc;
+
+	if (!ext4_has_inline_data(inode)) {
+		/*
+		 * Clear the flag so that no new write
+		 * will trap here again.
+		 */
+		ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+		return 0;
+	}
+
+	needed_blocks = ext4_chunk_trans_extent(inode, 1);
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		return ret;
+
+retry:
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		goto out;
+	}
+
+	/* We cannot recurse into the filesystem as the transaction is already
+	 * started */
+	folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
+				    mapping_gfp_mask(mapping));
+	if (IS_ERR(folio)) {
+		ret = PTR_ERR(folio);
+		goto out_nofolio;
+	}
+
+	ext4_write_lock_xattr(inode, &no_expand);
+	sem_held = 1;
+	/* If someone has already done this for us, just exit. */
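A note on the layout these helpers manipulate, as a sketch derived from the constants at the top of the file: the first EXT4_MIN_INLINE_DATA_SIZE bytes of inline data live in raw_inode->i_block and any remainder lives in the value of the "system.data" in-inode xattr, which is why ext4_read_inline_data() above copies in two pieces. The helper below is illustrative only:

static void demo_inline_split(unsigned int len,
			      unsigned int *from_i_block,
			      unsigned int *from_xattr)
{
	const unsigned int min_inline = 4 * 15;	/* sizeof(__le32) * EXT4_N_BLOCKS = 60 */

	*from_i_block = len < min_inline ? len : min_inline;
	*from_xattr = len - *from_i_block;	/* taken from "system.data" */
}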
+	if (!ext4_has_inline_data(inode)) {
+		ret = 0;
+		goto out;
+	}
+
+	from = 0;
+	to = ext4_get_inline_size(inode);
+	if (!folio_test_uptodate(folio)) {
+		ret = ext4_read_inline_folio(inode, folio);
+		if (ret < 0)
+			goto out;
+	}
+
+	ext4_fc_track_inode(handle, inode);
+	ret = ext4_destroy_inline_data_nolock(handle, inode);
+	if (ret)
+		goto out;
+
+	if (ext4_should_dioread_nolock(inode)) {
+		ret = ext4_block_write_begin(handle, folio, from, to,
+					     ext4_get_block_unwritten);
+	} else
+		ret = ext4_block_write_begin(handle, folio, from, to,
+					     ext4_get_block);
+	clear_buffer_new(folio_buffers(folio));
+
+	if (!ret && ext4_should_journal_data(inode)) {
+		ret = ext4_walk_page_buffers(handle, inode,
+					     folio_buffers(folio), from, to,
+					     NULL, do_journal_get_write_access);
+	}
+
+	if (ret) {
+		folio_unlock(folio);
+		folio_put(folio);
+		folio = NULL;
+		ext4_orphan_add(handle, inode);
+		ext4_write_unlock_xattr(inode, &no_expand);
+		sem_held = 0;
+		ext4_journal_stop(handle);
+		handle = NULL;
+		ext4_truncate_failed_write(inode);
+		/*
+		 * If truncate failed early the inode might
+		 * still be on the orphan list; we need to
+		 * make sure the inode is removed from the
+		 * orphan list in that case.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
+
+	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+
+	if (folio)
+		block_commit_write(folio, from, to);
+out:
+	if (folio) {
+		folio_unlock(folio);
+		folio_put(folio);
+	}
+out_nofolio:
+	if (sem_held)
+		ext4_write_unlock_xattr(inode, &no_expand);
+	if (handle)
+		ext4_journal_stop(handle);
+	brelse(iloc.bh);
+	return ret;
+}
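For orientation, a hypothetical caller mirroring how ext4_write_begin() in inode.c consumes the inline write helpers that follow: the inline path returns 1 when it has claimed the write (folio 0 locked), 0 when the caller should fall back to the block-based path, and a negative errno on failure. A sketch only; the fallback return value is a placeholder:

static int demo_write_begin(struct address_space *mapping, struct inode *inode,
			    loff_t pos, unsigned int len, struct folio **foliop)
{
	if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
		int ret = ext4_try_to_write_inline_data(mapping, inode,
							pos, len, foliop);
		if (ret < 0)
			return ret;
		if (ret == 1)
			return 0;	/* data will stay inline */
		/* ret == 0: converted (or not inline); fall through */
	}
	return -EOPNOTSUPP;		/* placeholder for the block-based path */
}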
+/*
+ * Prepare the write for the inline data.
+ * If the data can be written into the inode, we just read
+ * the page and make it uptodate, and start the journal.
+ * Otherwise read the page, make it dirty so that it can be
+ * handled in writepages (the i_disksize update is left to the
+ * normal ext4_da_write_end).
+ */
+int ext4_generic_write_inline_data(struct address_space *mapping,
+				   struct inode *inode,
+				   loff_t pos, unsigned len,
+				   struct folio **foliop,
+				   void **fsdata, bool da)
+{
+	int ret;
+	handle_t *handle;
+	struct folio *folio;
+	struct ext4_iloc iloc;
+	int retries = 0;
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		return ret;
+
+retry_journal:
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out_release_bh;
+	}
+
+	ret = ext4_prepare_inline_data(handle, inode, pos + len);
+	if (ret && ret != -ENOSPC)
+		goto out_stop_journal;
+
+	if (ret == -ENOSPC) {
+		ext4_journal_stop(handle);
+		if (!da) {
+			brelse(iloc.bh);
+			/* Retry inside */
+			return ext4_convert_inline_data_to_extent(mapping, inode);
+		}
+
+		ret = ext4_da_convert_inline_data_to_extent(mapping, inode, fsdata);
+		if (ret == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry_journal;
+		goto out_release_bh;
+	}
+
+	folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
+				    mapping_gfp_mask(mapping));
+	if (IS_ERR(folio)) {
+		ret = PTR_ERR(folio);
+		goto out_stop_journal;
+	}
+
+	down_read(&EXT4_I(inode)->xattr_sem);
+	/* Someone else had converted it to extent */
+	if (!ext4_has_inline_data(inode)) {
+		ret = 0;
+		goto out_release_folio;
+	}
+
+	if (!folio_test_uptodate(folio)) {
+		ret = ext4_read_inline_folio(inode, folio);
+		if (ret < 0)
+			goto out_release_folio;
+	}
+
+	ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE);
+	if (ret)
+		goto out_release_folio;
+	*foliop = folio;
+	up_read(&EXT4_I(inode)->xattr_sem);
+	brelse(iloc.bh);
+	return 1;
+
+out_release_folio:
+	up_read(&EXT4_I(inode)->xattr_sem);
+	folio_unlock(folio);
+	folio_put(folio);
+out_stop_journal:
+	ext4_journal_stop(handle);
+out_release_bh:
+	brelse(iloc.bh);
+	return ret;
+}
+
+/*
+ * Try to write data in the inode.
+ * If the inode has inline data, check whether the new write can be
+ * in the inode also. If not, create the page and the handle, move the
+ * data to the page, make it uptodate and let the later code create an
+ * extent for it.
+ */
+int ext4_try_to_write_inline_data(struct address_space *mapping,
+				  struct inode *inode,
+				  loff_t pos, unsigned len,
+				  struct folio **foliop)
+{
+	if (pos + len > ext4_get_max_inline_size(inode))
+		return ext4_convert_inline_data_to_extent(mapping, inode);
+	return ext4_generic_write_inline_data(mapping, inode, pos, len,
+					      foliop, NULL, false);
+}
+
+int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
+			       unsigned copied, struct folio *folio)
+{
+	handle_t *handle = ext4_journal_current_handle();
+	int no_expand;
+	void *kaddr;
+	struct ext4_iloc iloc;
+	int ret = 0, ret2;
+
+	if (unlikely(copied < len) && !folio_test_uptodate(folio))
+		copied = 0;
+
+	if (likely(copied)) {
+		ret = ext4_get_inode_loc(inode, &iloc);
+		if (ret) {
+			folio_unlock(folio);
+			folio_put(folio);
+			ext4_std_error(inode->i_sb, ret);
+			goto out;
+		}
+		ext4_write_lock_xattr(inode, &no_expand);
+		BUG_ON(!ext4_has_inline_data(inode));
+
+		/*
+		 * ei->i_inline_off may have changed since
+		 * ext4_write_begin() called
+		 * ext4_try_to_write_inline_data()
+		 */
+		(void) ext4_find_inline_data_nolock(inode);
+
+		kaddr = kmap_local_folio(folio, 0);
+		ext4_write_inline_data(inode, &iloc, kaddr, pos, copied);
+		kunmap_local(kaddr);
+		folio_mark_uptodate(folio);
+		/* clear dirty flag so that writepages wouldn't work for us.
*/ + folio_clear_dirty(folio); + + ext4_write_unlock_xattr(inode, &no_expand); + brelse(iloc.bh); + + /* + * It's important to update i_size while still holding folio + * lock: page writeout could otherwise come in and zero + * beyond i_size. + */ + ext4_update_inode_size(inode, pos + copied); + } + folio_unlock(folio); + folio_put(folio); + + /* + * Don't mark the inode dirty under folio lock. First, it unnecessarily + * makes the holding time of folio lock longer. Second, it forces lock + * ordering of folio lock and transaction start for journaling + * filesystems. + */ + if (likely(copied)) + mark_inode_dirty(inode); +out: + /* + * If we didn't copy as much data as expected, we need to trim back + * size of xattr containing inline data. + */ + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + ext4_orphan_add(handle, inode); + + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + return ret ? ret : copied; +} + +/* + * Try to make the page cache and handle ready for the inline data case. + * We can call this function in 2 cases: + * 1. The inode is created and the first write exceeds inline size. We can + * clear the inode state safely. + * 2. The inode has inline data, then we need to read the data, make it + * update and dirty so that ext4_da_writepages can handle it. We don't + * need to start the journal since the file's metadata isn't changed now. + */ +static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, + struct inode *inode, + void **fsdata) +{ + int ret = 0, inline_size; + struct folio *folio; + + folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + goto out; + } + + inline_size = ext4_get_inline_size(inode); + + if (!folio_test_uptodate(folio)) { + ret = ext4_read_inline_folio(inode, folio); + if (ret < 0) + goto out; + } + + ret = ext4_block_write_begin(NULL, folio, 0, inline_size, + ext4_da_get_block_prep); + if (ret) { + up_read(&EXT4_I(inode)->xattr_sem); + folio_unlock(folio); + folio_put(folio); + ext4_truncate_failed_write(inode); + return ret; + } + + clear_buffer_new(folio_buffers(folio)); + folio_mark_dirty(folio); + folio_mark_uptodate(folio); + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + *fsdata = (void *)CONVERT_INLINE_DATA; + +out: + up_read(&EXT4_I(inode)->xattr_sem); + if (folio) { + folio_unlock(folio); + folio_put(folio); + } + return ret; +} + +#ifdef INLINE_DIR_DEBUG +void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, + void *inline_start, int inline_size) +{ + int offset; + unsigned short de_len; + struct ext4_dir_entry_2 *de = inline_start; + void *dlimit = inline_start + inline_size; + + trace_printk("inode %lu\n", dir->i_ino); + offset = 0; + while ((void *)de < dlimit) { + de_len = ext4_rec_len_from_disk(de->rec_len, inline_size); + trace_printk("de: off %u rlen %u name %.*s nlen %u ino %u\n", + offset, de_len, de->name_len, de->name, + de->name_len, le32_to_cpu(de->inode)); + if (ext4_check_dir_entry(dir, NULL, de, bh, + inline_start, inline_size, offset)) + 
BUG();
+
+		offset += de_len;
+		de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
+	}
+}
+#else
+#define ext4_show_inline_dir(dir, bh, inline_start, inline_size)
+#endif
+
+/*
+ * Add a new entry into an inline dir.
+ * It will return -ENOSPC if no space is available, and -EIO
+ * and -EEXIST if the directory entry already exists.
+ */
+static int ext4_add_dirent_to_inline(handle_t *handle,
+				     struct ext4_filename *fname,
+				     struct inode *dir,
+				     struct inode *inode,
+				     struct ext4_iloc *iloc,
+				     void *inline_start, int inline_size)
+{
+	int err;
+	struct ext4_dir_entry_2 *de;
+
+	err = ext4_find_dest_de(dir, iloc->bh, inline_start,
+				inline_size, fname, &de);
+	if (err)
+		return err;
+
+	BUFFER_TRACE(iloc->bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, dir->i_sb, iloc->bh,
+					    EXT4_JTR_NONE);
+	if (err)
+		return err;
+	ext4_insert_dentry(dir, inode, de, inline_size, fname);
+
+	ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
+
+	/*
+	 * XXX shouldn't update any times until successful
+	 * completion of syscall, but too many callers depend
+	 * on this.
+	 *
+	 * XXX similarly, too many callers depend on
+	 * ext4_new_inode() setting the times, but error
+	 * recovery deletes the inode, so the worst that can
+	 * happen is that the times are slightly out of date
+	 * and/or different from the directory change time.
+	 */
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
+	ext4_update_dx_flag(dir);
+	inode_inc_iversion(dir);
+	return 1;
+}
+
+static void *ext4_get_inline_xattr_pos(struct inode *inode,
+				       struct ext4_iloc *iloc)
+{
+	struct ext4_xattr_entry *entry;
+	struct ext4_xattr_ibody_header *header;
+
+	BUG_ON(!EXT4_I(inode)->i_inline_off);
+
+	header = IHDR(inode, ext4_raw_inode(iloc));
+	entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) +
+					    EXT4_I(inode)->i_inline_off);
+
+	return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs);
+}
+
+/* Set the final de to cover the whole block. */
+void ext4_update_final_de(void *de_buf, int old_size, int new_size)
+{
+	struct ext4_dir_entry_2 *de, *prev_de;
+	void *limit;
+	int de_len;
+
+	de = de_buf;
+	if (old_size) {
+		limit = de_buf + old_size;
+		do {
+			prev_de = de;
+			de_len = ext4_rec_len_from_disk(de->rec_len, old_size);
+			de_buf += de_len;
+			de = de_buf;
+		} while (de_buf < limit);
+
+		prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size -
+							old_size, new_size);
+	} else {
+		/* This is just created, so create an empty entry. */
+		de->inode = 0;
+		de->rec_len = ext4_rec_len_to_disk(new_size, new_size);
+	}
+}
+
+static int ext4_update_inline_dir(handle_t *handle, struct inode *dir,
+				  struct ext4_iloc *iloc)
+{
+	int ret;
+	int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE;
+	int new_size = get_max_inline_xattr_value_size(dir, iloc);
+
+	if (new_size - old_size <= ext4_dir_rec_len(1, NULL))
+		return -ENOSPC;
+
+	ret = ext4_update_inline_data(handle, dir,
+				      new_size + EXT4_MIN_INLINE_DATA_SIZE);
+	if (ret)
+		return ret;
+
+	ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size,
+			     EXT4_I(dir)->i_inline_size -
+			     EXT4_MIN_INLINE_DATA_SIZE);
+	dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size;
+	return 0;
+}
+
+static void ext4_restore_inline_data(handle_t *handle, struct inode *inode,
+				     struct ext4_iloc *iloc,
+				     void *buf, int inline_size)
+{
+	int ret;
+
+	ret = ext4_create_inline_data(handle, inode, inline_size);
+	if (ret) {
+		ext4_msg(inode->i_sb, KERN_EMERG,
+			"error restoring inline_data for inode -- potential data loss! (inode %lu, error %d)",
+			inode->i_ino, ret);
+		return;
+	}
+	ext4_write_inline_data(inode, iloc, buf, 0, inline_size);
+	ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+}
+
+static int ext4_convert_inline_data_nolock(handle_t *handle,
+					   struct inode *inode,
+					   struct ext4_iloc *iloc)
+{
+	int error;
+	void *buf = NULL;
+	struct buffer_head *data_bh = NULL;
+	struct ext4_map_blocks map;
+	int inline_size;
+
+	inline_size = ext4_get_inline_size(inode);
+	buf = kmalloc(inline_size, GFP_NOFS);
+	if (!buf) {
+		error = -ENOMEM;
+		goto out;
+	}
+
+	error = ext4_read_inline_data(inode, buf, inline_size, iloc);
+	if (error < 0)
+		goto out;
+
+	/*
+	 * Make sure the inline directory entries pass checks before we try to
+	 * convert them, so that we avoid touching stuff that needs fsck.
+	 */
+	if (S_ISDIR(inode->i_mode)) {
+		error = ext4_check_all_de(inode, iloc->bh,
+					  buf + EXT4_INLINE_DOTDOT_SIZE,
+					  inline_size - EXT4_INLINE_DOTDOT_SIZE);
+		if (error)
+			goto out;
+	}
+
+	error = ext4_destroy_inline_data_nolock(handle, inode);
+	if (error)
+		goto out;
+
+	map.m_lblk = 0;
+	map.m_len = 1;
+	map.m_flags = 0;
+	error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE);
+	if (error < 0)
+		goto out_restore;
+	if (!(map.m_flags & EXT4_MAP_MAPPED)) {
+		error = -EIO;
+		goto out_restore;
+	}
+
+	data_bh = sb_getblk(inode->i_sb, map.m_pblk);
+	if (!data_bh) {
+		error = -ENOMEM;
+		goto out_restore;
+	}
+
+	lock_buffer(data_bh);
+	error = ext4_journal_get_create_access(handle, inode->i_sb, data_bh,
+					       EXT4_JTR_NONE);
+	if (error) {
+		unlock_buffer(data_bh);
+		error = -EIO;
+		goto out_restore;
+	}
+	memset(data_bh->b_data, 0, inode->i_sb->s_blocksize);
+
+	if (!S_ISDIR(inode->i_mode)) {
+		memcpy(data_bh->b_data, buf, inline_size);
+		set_buffer_uptodate(data_bh);
+		unlock_buffer(data_bh);
+		error = ext4_handle_dirty_metadata(handle,
+						   inode, data_bh);
+	} else {
+		unlock_buffer(data_bh);
+		inode->i_size = inode->i_sb->s_blocksize;
+		i_size_write(inode, inode->i_sb->s_blocksize);
+		EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+
+		error = ext4_init_dirblock(handle, inode, data_bh,
+			le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode),
+			buf + EXT4_INLINE_DOTDOT_SIZE,
+			inline_size - EXT4_INLINE_DOTDOT_SIZE);
+		if (!error)
+			error = ext4_mark_inode_dirty(handle, inode);
+	}
+
+out_restore:
+	if (error)
+		ext4_restore_inline_data(handle, inode, iloc, buf, inline_size);
+
+out:
+	brelse(data_bh);
+	kfree(buf);
+	return error;
+}
+
+/*
+ * Try to add the new entry to the inline data.
+ * If it succeeds, return 0. If not, extend the inline dir and copy the
+ * data to the newly created block.
+ */
+int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
+			      struct inode *dir, struct inode *inode)
+{
+	int ret, ret2, inline_size, no_expand;
+	void *inline_start;
+	struct ext4_iloc iloc;
+
+	ret = ext4_get_inode_loc(dir, &iloc);
+	if (ret)
+		return ret;
+
+	ext4_write_lock_xattr(dir, &no_expand);
+	if (!ext4_has_inline_data(dir))
+		goto out;
+
+	inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
+			EXT4_INLINE_DOTDOT_SIZE;
+	inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
+
+	ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc,
+					inline_start, inline_size);
+	if (ret != -ENOSPC)
+		goto out;
+
+	/* Check whether it can be inserted into the inline xattr space. */
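The placement order just referenced can be summarized as a sketch, with the capacity derived from the constants defined at the top of this file (illustrative only):

/* Where ext4_try_add_inline_entry() tries to place a new dirent:
 *   1) the i_block area after the 4-byte ".." inode prefix,
 *   2) the existing "system.data" xattr value, if any,
 *   3) a grown xattr value via ext4_update_inline_dir(),
 *   4) otherwise convert the directory to a real block.
 */
int iblock_room = 4 * 15 - 4;	/* EXT4_MIN_INLINE_DATA_SIZE - DOTDOT = 56 bytes */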
+	inline_size = EXT4_I(dir)->i_inline_size -
+			EXT4_MIN_INLINE_DATA_SIZE;
+	if (!inline_size) {
+		/* Try to use the xattr space. */
+		ret = ext4_update_inline_dir(handle, dir, &iloc);
+		if (ret && ret != -ENOSPC)
+			goto out;
+
+		inline_size = EXT4_I(dir)->i_inline_size -
+				EXT4_MIN_INLINE_DATA_SIZE;
+	}
+
+	if (inline_size) {
+		inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
+
+		ret = ext4_add_dirent_to_inline(handle, fname, dir,
+						inode, &iloc, inline_start,
+						inline_size);
+
+		if (ret != -ENOSPC)
+			goto out;
+	}
+
+	/*
+	 * The inline space is filled up, so create a new block for it.
+	 * As the extent tree will be created, we have to save the inline
+	 * dir first.
+	 */
+	ret = ext4_convert_inline_data_nolock(handle, dir, &iloc);
+
+out:
+	ext4_write_unlock_xattr(dir, &no_expand);
+	ret2 = ext4_mark_inode_dirty(handle, dir);
+	if (unlikely(ret2 && !ret))
+		ret = ret2;
+	brelse(iloc.bh);
+	return ret;
+}
+
+/*
+ * This function fills a red-black tree with information from an
+ * inlined dir. It returns the number of directory entries loaded
+ * into the tree. If there is an error it is returned in err.
+ */
+int ext4_inlinedir_to_tree(struct file *dir_file,
+			   struct inode *dir, ext4_lblk_t block,
+			   struct dx_hash_info *hinfo,
+			   __u32 start_hash, __u32 start_minor_hash,
+			   int *has_inline_data)
+{
+	int err = 0, count = 0;
+	unsigned int parent_ino;
+	int pos;
+	struct ext4_dir_entry_2 *de;
+	struct inode *inode = file_inode(dir_file);
+	int ret, inline_size = 0;
+	struct ext4_iloc iloc;
+	void *dir_buf = NULL;
+	struct ext4_dir_entry_2 fake;
+	struct fscrypt_str tmp_str;
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		return ret;
+
+	down_read(&EXT4_I(inode)->xattr_sem);
+	if (!ext4_has_inline_data(inode)) {
+		up_read(&EXT4_I(inode)->xattr_sem);
+		*has_inline_data = 0;
+		goto out;
+	}
+
+	inline_size = ext4_get_inline_size(inode);
+	dir_buf = kmalloc(inline_size, GFP_NOFS);
+	if (!dir_buf) {
+		ret = -ENOMEM;
+		up_read(&EXT4_I(inode)->xattr_sem);
+		goto out;
+	}
+
+	ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
+	up_read(&EXT4_I(inode)->xattr_sem);
+	if (ret < 0)
+		goto out;
+
+	pos = 0;
+	parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+	while (pos < inline_size) {
+		/*
+		 * As inlined dir doesn't store any information about '.' and
+		 * only the inode number of '..' is stored, we have to handle
+		 * them differently.
+		 */
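Concretely, assuming the classic 8-byte dirent header with no casefold hash fields, the fabricated entries here and the offset remapping used by ext4_read_inline_dir() below work out to (illustrative arithmetic only):

unsigned int dot_len = (8 + 1 + 3) & ~3U;	/* ext4_dir_rec_len(1, NULL) == 12 */
unsigned int dotdot_len = (8 + 2 + 3) & ~3U;	/* ext4_dir_rec_len(2, NULL) == 12 */
unsigned int dotdot_offset = dot_len;			/* 12 */
unsigned int dotdot_size = dot_len + dotdot_len;	/* 24 */
unsigned int extra_offset = dotdot_size - 4;	/* 20; 4 == EXT4_INLINE_DOTDOT_SIZE */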
+		if (pos == 0) {
+			fake.inode = cpu_to_le32(inode->i_ino);
+			fake.name_len = 1;
+			memcpy(fake.name, ".", 2);
+			fake.rec_len = ext4_rec_len_to_disk(
+					ext4_dir_rec_len(fake.name_len, NULL),
+					inline_size);
+			ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+			de = &fake;
+			pos = EXT4_INLINE_DOTDOT_OFFSET;
+		} else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
+			fake.inode = cpu_to_le32(parent_ino);
+			fake.name_len = 2;
+			memcpy(fake.name, "..", 3);
+			fake.rec_len = ext4_rec_len_to_disk(
+					ext4_dir_rec_len(fake.name_len, NULL),
+					inline_size);
+			ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+			de = &fake;
+			pos = EXT4_INLINE_DOTDOT_SIZE;
+		} else {
+			de = (struct ext4_dir_entry_2 *)(dir_buf + pos);
+			pos += ext4_rec_len_from_disk(de->rec_len, inline_size);
+			if (ext4_check_dir_entry(inode, dir_file, de,
+						 iloc.bh, dir_buf,
+						 inline_size, pos)) {
+				ret = count;
+				goto out;
+			}
+		}
+
+		if (ext4_hash_in_dirent(dir)) {
+			hinfo->hash = EXT4_DIRENT_HASH(de);
+			hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de);
+		} else {
+			err = ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
+			if (err) {
+				ret = err;
+				goto out;
+			}
+		}
+		if ((hinfo->hash < start_hash) ||
+		    ((hinfo->hash == start_hash) &&
+		     (hinfo->minor_hash < start_minor_hash)))
+			continue;
+		if (de->inode == 0)
+			continue;
+		tmp_str.name = de->name;
+		tmp_str.len = de->name_len;
+		err = ext4_htree_store_dirent(dir_file, hinfo->hash,
+					      hinfo->minor_hash, de, &tmp_str);
+		if (err) {
+			ret = err;
+			goto out;
+		}
+		count++;
+	}
+	ret = count;
+out:
+	kfree(dir_buf);
+	brelse(iloc.bh);
+	return ret;
+}
+
+/*
+ * So this function is called when the volume is mkfsed with
+ * dir_index disabled. In order to keep f_pos persistent
+ * after we convert from an inlined dir to a block-based one,
+ * we just pretend that we are a normal dir and return the
+ * offset as if '.' and '..' really take place.
+ *
+ */
+int ext4_read_inline_dir(struct file *file,
+			 struct dir_context *ctx,
+			 int *has_inline_data)
+{
+	unsigned int offset, parent_ino;
+	int i;
+	struct ext4_dir_entry_2 *de;
+	struct super_block *sb;
+	struct inode *inode = file_inode(file);
+	int ret, inline_size = 0;
+	struct ext4_iloc iloc;
+	void *dir_buf = NULL;
+	int dotdot_offset, dotdot_size, extra_offset, extra_size;
+	struct dir_private_info *info = file->private_data;
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		return ret;
+
+	down_read(&EXT4_I(inode)->xattr_sem);
+	if (!ext4_has_inline_data(inode)) {
+		up_read(&EXT4_I(inode)->xattr_sem);
+		*has_inline_data = 0;
+		goto out;
+	}
+
+	inline_size = ext4_get_inline_size(inode);
+	dir_buf = kmalloc(inline_size, GFP_NOFS);
+	if (!dir_buf) {
+		ret = -ENOMEM;
+		up_read(&EXT4_I(inode)->xattr_sem);
+		goto out;
+	}
+
+	ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
+	up_read(&EXT4_I(inode)->xattr_sem);
+	if (ret < 0)
+		goto out;
+
+	ret = 0;
+	sb = inode->i_sb;
+	parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+	offset = ctx->pos;
+
+	/*
+	 * dotdot_offset and dotdot_size are the real offset and
+	 * size for ".." and "." if the dir is block-based, while
+	 * the real size for them is only EXT4_INLINE_DOTDOT_SIZE.
+	 * So we will use extra_offset and extra_size to indicate them
+	 * during the inline dir iteration.
+ */ + dotdot_offset = ext4_dir_rec_len(1, NULL); + dotdot_size = dotdot_offset + ext4_dir_rec_len(2, NULL); + extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; + extra_size = extra_offset + inline_size; + + /* + * If the cookie has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the inline + * dir to make sure. + */ + if (!inode_eq_iversion(inode, info->cookie)) { + for (i = 0; i < extra_size && i < offset;) { + /* + * "." is with offset 0 and + * ".." is dotdot_offset. + */ + if (!i) { + i = dotdot_offset; + continue; + } else if (i == dotdot_offset) { + i = dotdot_size; + continue; + } + /* for other entry, the real offset in + * the buf has to be tuned accordingly. + */ + de = (struct ext4_dir_entry_2 *) + (dir_buf + i - extra_offset); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (ext4_rec_len_from_disk(de->rec_len, extra_size) + < ext4_dir_rec_len(1, NULL)) + break; + i += ext4_rec_len_from_disk(de->rec_len, + extra_size); + } + offset = i; + ctx->pos = offset; + info->cookie = inode_query_iversion(inode); + } + + while (ctx->pos < extra_size) { + if (ctx->pos == 0) { + if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR)) + goto out; + ctx->pos = dotdot_offset; + continue; + } + + if (ctx->pos == dotdot_offset) { + if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR)) + goto out; + ctx->pos = dotdot_size; + continue; + } + + de = (struct ext4_dir_entry_2 *) + (dir_buf + ctx->pos - extra_offset); + if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf, + extra_size, ctx->pos)) + goto out; + if (le32_to_cpu(de->inode)) { + if (!dir_emit(ctx, de->name, de->name_len, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) + goto out; + } + ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size); + } +out: + kfree(dir_buf); + brelse(iloc.bh); + return ret; +} + +void *ext4_read_inline_link(struct inode *inode) +{ + struct ext4_iloc iloc; + int ret, inline_size; + void *link; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ERR_PTR(ret); + + ret = -ENOMEM; + inline_size = ext4_get_inline_size(inode); + link = kmalloc(inline_size + 1, GFP_NOFS); + if (!link) + goto out; + + ret = ext4_read_inline_data(inode, link, inline_size, &iloc); + if (ret < 0) { + kfree(link); + goto out; + } + nd_terminate_link(link, inode->i_size, ret); +out: + if (ret < 0) + link = ERR_PTR(ret); + brelse(iloc.bh); + return link; +} + +struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval) +{ + struct ext4_iloc iloc; + + *retval = ext4_get_inode_loc(inode, &iloc); + if (*retval) + return NULL; + + *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; + + return iloc.bh; +} + +/* + * Try to create the inline data for the new dir. + * If it succeeds, return 0, otherwise return the error. + * In case of ENOSPC, the caller should create the normal disk layout dir. 
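+ * Note that '.' is never stored for an inline dir; only the parent + * inode number of '..' is kept, right at the start of i_block.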
+ */ +int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, + struct inode *inode) +{ + int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE; + struct ext4_iloc iloc; + struct ext4_dir_entry_2 *de; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + ret = ext4_prepare_inline_data(handle, inode, inline_size); + if (ret) + goto out; + + /* + * For inline dir, we only save the inode information for the ".." + * and create a fake dentry to cover the left space. + */ + de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; + de->inode = cpu_to_le32(parent->i_ino); + de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE); + de->inode = 0; + de->rec_len = ext4_rec_len_to_disk( + inline_size - EXT4_INLINE_DOTDOT_SIZE, + inline_size); + set_nlink(inode, 2); + inode->i_size = EXT4_I(inode)->i_disksize = inline_size; +out: + brelse(iloc.bh); + return ret; +} + +struct buffer_head *ext4_find_inline_entry(struct inode *dir, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data) +{ + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + int ret; + void *inline_start; + int inline_size; + + ret = ext4_get_inode_loc(dir, &is.iloc); + if (ret) + return ERR_PTR(ret); + + down_read(&EXT4_I(dir)->xattr_sem); + + ret = ext4_xattr_ibody_find(dir, &i, &is); + if (ret) + goto out; + + if (!ext4_has_inline_data(dir)) { + *has_inline_data = 0; + goto out; + } + + inline_start = (void *)ext4_raw_inode(&is.iloc)->i_block + + EXT4_INLINE_DOTDOT_SIZE; + inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; + ret = ext4_search_dir(is.iloc.bh, inline_start, inline_size, + dir, fname, 0, res_dir); + if (ret == 1) + goto out_find; + if (ret < 0) + goto out; + + if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE) + goto out; + + inline_start = ext4_get_inline_xattr_pos(dir, &is.iloc); + inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; + + ret = ext4_search_dir(is.iloc.bh, inline_start, inline_size, + dir, fname, 0, res_dir); + if (ret == 1) + goto out_find; + +out: + brelse(is.iloc.bh); + if (ret < 0) + is.iloc.bh = ERR_PTR(ret); + else + is.iloc.bh = NULL; +out_find: + up_read(&EXT4_I(dir)->xattr_sem); + return is.iloc.bh; +} + +int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data) +{ + int err, inline_size, no_expand; + struct ext4_iloc iloc; + void *inline_start; + + err = ext4_get_inode_loc(dir, &iloc); + if (err) + return err; + + ext4_write_lock_xattr(dir, &no_expand); + if (!ext4_has_inline_data(dir)) { + *has_inline_data = 0; + goto out; + } + + if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) < + EXT4_MIN_INLINE_DATA_SIZE) { + inline_start = (void *)ext4_raw_inode(&iloc)->i_block + + EXT4_INLINE_DOTDOT_SIZE; + inline_size = EXT4_MIN_INLINE_DATA_SIZE - + EXT4_INLINE_DOTDOT_SIZE; + } else { + inline_start = ext4_get_inline_xattr_pos(dir, &iloc); + inline_size = ext4_get_inline_size(dir) - + EXT4_MIN_INLINE_DATA_SIZE; + } + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, dir->i_sb, bh, + EXT4_JTR_NONE); + if (err) + goto out; + + err = ext4_generic_delete_entry(dir, de_del, bh, + inline_start, inline_size, 0); + if (err) + goto out; + + ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size); +out: 
+ ext4_write_unlock_xattr(dir, &no_expand); + if (likely(err == 0)) + err = ext4_mark_inode_dirty(handle, dir); + brelse(iloc.bh); + if (err != -ENOENT) + ext4_std_error(dir->i_sb, err); + return err; +} + +/* + * Get the inline dentry at offset. + */ +static inline struct ext4_dir_entry_2 * +ext4_get_inline_entry(struct inode *inode, + struct ext4_iloc *iloc, + unsigned int offset, + void **inline_start, + int *inline_size) +{ + void *inline_pos; + + BUG_ON(offset > ext4_get_inline_size(inode)); + + if (offset < EXT4_MIN_INLINE_DATA_SIZE) { + inline_pos = (void *)ext4_raw_inode(iloc)->i_block; + *inline_size = EXT4_MIN_INLINE_DATA_SIZE; + } else { + inline_pos = ext4_get_inline_xattr_pos(inode, iloc); + offset -= EXT4_MIN_INLINE_DATA_SIZE; + *inline_size = ext4_get_inline_size(inode) - + EXT4_MIN_INLINE_DATA_SIZE; + } + + if (inline_start) + *inline_start = inline_pos; + return (struct ext4_dir_entry_2 *)(inline_pos + offset); +} + +bool empty_inline_dir(struct inode *dir, int *has_inline_data) +{ + int err, inline_size; + struct ext4_iloc iloc; + size_t inline_len; + void *inline_pos; + unsigned int offset; + struct ext4_dir_entry_2 *de; + bool ret = false; + + err = ext4_get_inode_loc(dir, &iloc); + if (err) { + EXT4_ERROR_INODE_ERR(dir, -err, + "error %d getting inode %lu block", + err, dir->i_ino); + return false; + } + + down_read(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) { + *has_inline_data = 0; + ret = true; + goto out; + } + + de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; + if (!le32_to_cpu(de->inode)) { + ext4_warning(dir->i_sb, + "bad inline directory (dir #%lu) - no `..'", + dir->i_ino); + goto out; + } + + inline_len = ext4_get_inline_size(dir); + offset = EXT4_INLINE_DOTDOT_SIZE; + while (offset < inline_len) { + de = ext4_get_inline_entry(dir, &iloc, offset, + &inline_pos, &inline_size); + if (ext4_check_dir_entry(dir, NULL, de, + iloc.bh, inline_pos, + inline_size, offset)) { + ext4_warning(dir->i_sb, + "bad inline directory (dir #%lu) - " + "inode %u, rec_len %u, name_len %d" + "inline size %d", + dir->i_ino, le32_to_cpu(de->inode), + le16_to_cpu(de->rec_len), de->name_len, + inline_size); + goto out; + } + if (le32_to_cpu(de->inode)) { + goto out; + } + offset += ext4_rec_len_from_disk(de->rec_len, inline_size); + } + + ret = true; +out: + up_read(&EXT4_I(dir)->xattr_sem); + brelse(iloc.bh); + return ret; +} + +int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) +{ + int ret, no_expand; + + ext4_write_lock_xattr(inode, &no_expand); + ret = ext4_destroy_inline_data_nolock(handle, inode); + ext4_write_unlock_xattr(inode, &no_expand); + + return ret; +} + +int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap) +{ + __u64 addr; + int error = -EAGAIN; + struct ext4_iloc iloc; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) + goto out; + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + goto out; + + addr = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; + addr += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; + addr += offsetof(struct ext4_inode, i_block); + + brelse(iloc.bh); + + iomap->addr = addr; + iomap->offset = 0; + iomap->length = min_t(loff_t, ext4_get_inline_size(inode), + i_size_read(inode)); + iomap->type = IOMAP_INLINE; + iomap->flags = 0; + +out: + up_read(&EXT4_I(inode)->xattr_sem); + return error; +} + +int ext4_inline_data_truncate(struct inode *inode, int *has_inline) +{ + handle_t *handle; + int inline_size, value_len, needed_blocks, 
no_expand, err = 0; + size_t i_size; + void *value = NULL; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + + + needed_blocks = ext4_chunk_trans_extent(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ext4_write_lock_xattr(inode, &no_expand); + if (!ext4_has_inline_data(inode)) { + ext4_write_unlock_xattr(inode, &no_expand); + *has_inline = 0; + ext4_journal_stop(handle); + return 0; + } + + if ((err = ext4_orphan_add(handle, inode)) != 0) + goto out; + + if ((err = ext4_get_inode_loc(inode, &is.iloc)) != 0) + goto out; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = inode->i_size; + inline_size = ext4_get_inline_size(inode); + EXT4_I(inode)->i_disksize = i_size; + + if (i_size < inline_size) { + /* + * if there's inline data to truncate and this file was + * converted to extents after that inline data was written, + * the extent status cache must be cleared to avoid leaving + * behind stale delayed allocated extent entries + */ + if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) + ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); + + /* Clear the content in the xattr space. */ + if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { + if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0) + goto out_error; + + if (is.s.not_found) { + EXT4_ERROR_INODE(inode, + "missing inline data xattr"); + err = -EFSCORRUPTED; + goto out_error; + } + + value_len = le32_to_cpu(is.s.here->e_value_size); + value = kmalloc(value_len, GFP_NOFS); + if (!value) { + err = -ENOMEM; + goto out_error; + } + + err = ext4_xattr_ibody_get(inode, i.name_index, + i.name, value, value_len); + if (err <= 0) + goto out_error; + + i.value = value; + i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ? + i_size - EXT4_MIN_INLINE_DATA_SIZE : 0; + err = ext4_xattr_ibody_set(handle, inode, &i, &is); + if (err) + goto out_error; + } + + /* Clear the content within i_blocks. */ + if (i_size < EXT4_MIN_INLINE_DATA_SIZE) { + void *p = (void *) ext4_raw_inode(&is.iloc)->i_block; + memset(p + i_size, 0, + EXT4_MIN_INLINE_DATA_SIZE - i_size); + } + + EXT4_I(inode)->i_inline_size = i_size < + EXT4_MIN_INLINE_DATA_SIZE ? + EXT4_MIN_INLINE_DATA_SIZE : i_size; + } + +out_error: + up_write(&EXT4_I(inode)->i_data_sem); +out: + brelse(is.iloc.bh); + ext4_write_unlock_xattr(inode, &no_expand); + kfree(value); + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + if (err == 0) { + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + err = ext4_mark_inode_dirty(handle, inode); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + } + ext4_journal_stop(handle); + return err; +} + +int ext4_convert_inline_data(struct inode *inode) +{ + int error, needed_blocks, no_expand; + handle_t *handle; + struct ext4_iloc iloc; + + if (!ext4_has_inline_data(inode)) { + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + return 0; + } else if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + /* + * Inode has inline data but EXT4_STATE_MAY_INLINE_DATA is + * cleared. This means we are in the middle of moving of + * inline data to delay allocated block. Just force writeout + * here to finish conversion. 
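+ * Once the writeout completes the inline flag may already be + * clear, which is why it is re-checked below.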
+ */ + error = filemap_flush(inode->i_mapping); + if (error) + return error; + if (!ext4_has_inline_data(inode)) + return 0; + } + + needed_blocks = ext4_chunk_trans_extent(inode, 1); + + iloc.bh = NULL; + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto out_free; + } + + ext4_write_lock_xattr(inode, &no_expand); + if (ext4_has_inline_data(inode)) + error = ext4_convert_inline_data_nolock(handle, inode, &iloc); + ext4_write_unlock_xattr(inode, &no_expand); + ext4_journal_stop(handle); +out_free: + brelse(iloc.bh); + return error; +} -- 2.43.0
From: Simon Glass <simon.glass@canonical.com> Copy inode-test.c, ioctl.c, mballoc.h, and mballoc-test.c from Linux v6.18 fs/ext4 directory. - inode-test: KUnit tests for inode timestamp handling - ioctl: filesystem ioctl handlers - mballoc.h: multiblock allocator header - mballoc-test: KUnit tests for multiblock allocator Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> --- fs/ext4l/inode-test.c | 283 ++++++ fs/ext4l/ioctl.c | 2020 +++++++++++++++++++++++++++++++++++++++ fs/ext4l/mballoc-test.c | 999 +++++++++++++++++++ fs/ext4l/mballoc.h | 273 ++++++ 4 files changed, 3575 insertions(+) create mode 100644 fs/ext4l/inode-test.c create mode 100644 fs/ext4l/ioctl.c create mode 100644 fs/ext4l/mballoc-test.c create mode 100644 fs/ext4l/mballoc.h diff --git a/fs/ext4l/inode-test.c b/fs/ext4l/inode-test.c new file mode 100644 index 00000000000..749af7ad4e0 --- /dev/null +++ b/fs/ext4l/inode-test.c @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test of ext4 inode that verify the seconds part of [a/c/m] + * timestamps in ext4 inode structs are decoded correctly. + */ + +#include <kunit/test.h> +#include <linux/kernel.h> +#include <linux/time64.h> + +#include "ext4.h" + +/* + * For constructing the nonnegative timestamp lower bound value. + * binary: 00000000 00000000 00000000 00000000 + */ +#define LOWER_MSB_0 0L +/* + * For constructing the nonnegative timestamp upper bound value. + * binary: 01111111 11111111 11111111 11111111 + * + */ +#define UPPER_MSB_0 0x7fffffffL +/* + * For constructing the negative timestamp lower bound value. + * binary: 10000000 00000000 00000000 00000000 + */ +#define LOWER_MSB_1 (-(UPPER_MSB_0) - 1L) /* avoid overflow */ +/* + * For constructing the negative timestamp upper bound value. + * binary: 11111111 11111111 11111111 11111111 + */ +#define UPPER_MSB_1 (-1L) +/* + * Upper bound for nanoseconds value supported by the encoding. + * binary: 00111111 11111111 11111111 11111111 + */ +#define MAX_NANOSECONDS ((1L << 30) - 1) + +#define CASE_NAME_FORMAT "%s: msb:%x lower_bound:%x extra_bits: %x" + +#define LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE\ + "1901-12-13 Lower bound of 32bit < 0 timestamp, no extra bits" +#define UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE\ + "1969-12-31 Upper bound of 32bit < 0 timestamp, no extra bits" +#define LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE\ + "1970-01-01 Lower bound of 32bit >=0 timestamp, no extra bits" +#define UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE\ + "2038-01-19 Upper bound of 32bit >=0 timestamp, no extra bits" +#define LOWER_BOUND_NEG_LO_1_CASE\ + "2038-01-19 Lower bound of 32bit <0 timestamp, lo extra sec bit on" +#define UPPER_BOUND_NEG_LO_1_CASE\ + "2106-02-07 Upper bound of 32bit <0 timestamp, lo extra sec bit on" +#define LOWER_BOUND_NONNEG_LO_1_CASE\ + "2106-02-07 Lower bound of 32bit >=0 timestamp, lo extra sec bit on" +#define UPPER_BOUND_NONNEG_LO_1_CASE\ + "2174-02-25 Upper bound of 32bit >=0 timestamp, lo extra sec bit on" +#define LOWER_BOUND_NEG_HI_1_CASE\ + "2174-02-25 Lower bound of 32bit <0 timestamp, hi extra sec bit on" +#define UPPER_BOUND_NEG_HI_1_CASE\ + "2242-03-16 Upper bound of 32bit <0 timestamp, hi extra sec bit on" +#define LOWER_BOUND_NONNEG_HI_1_CASE\ + "2242-03-16 Lower bound of 32bit >=0 timestamp, hi extra sec bit on" +#define UPPER_BOUND_NONNEG_HI_1_CASE\ + "2310-04-04 Upper bound of 32bit >=0 timestamp, hi extra sec bit on" +#define UPPER_BOUND_NONNEG_HI_1_NS_1_CASE\ + "2310-04-04 Upper bound of 32bit>=0 timestamp, hi extra sec bit 1. 
1 ns" +#define LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE\ + "2378-04-22 Lower bound of 32bit>= timestamp. Extra sec bits 1. Max ns" +#define LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE\ + "2378-04-22 Lower bound of 32bit >=0 timestamp. All extra sec bits on" +#define UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE\ + "2446-05-10 Upper bound of 32bit >=0 timestamp. All extra sec bits on" + +struct timestamp_expectation { + const char *test_case_name; + struct timespec64 expected; + u32 extra_bits; + bool msb_set; + bool lower_bound; +}; + +static const struct timestamp_expectation test_data[] = { + { + .test_case_name = LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE, + .msb_set = true, + .lower_bound = true, + .extra_bits = 0, + .expected = {.tv_sec = -0x80000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE, + .msb_set = true, + .lower_bound = false, + .extra_bits = 0, + .expected = {.tv_sec = -1LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE, + .msb_set = false, + .lower_bound = true, + .extra_bits = 0, + .expected = {0LL, 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE, + .msb_set = false, + .lower_bound = false, + .extra_bits = 0, + .expected = {.tv_sec = 0x7fffffffLL, .tv_nsec = 0L}, + }, + + { + .test_case_name = LOWER_BOUND_NEG_LO_1_CASE, + .msb_set = true, + .lower_bound = true, + .extra_bits = 1, + .expected = {.tv_sec = 0x80000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NEG_LO_1_CASE, + .msb_set = true, + .lower_bound = false, + .extra_bits = 1, + .expected = {.tv_sec = 0xffffffffLL, .tv_nsec = 0L}, + }, + + { + .test_case_name = LOWER_BOUND_NONNEG_LO_1_CASE, + .msb_set = false, + .lower_bound = true, + .extra_bits = 1, + .expected = {.tv_sec = 0x100000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NONNEG_LO_1_CASE, + .msb_set = false, + .lower_bound = false, + .extra_bits = 1, + .expected = {.tv_sec = 0x17fffffffLL, .tv_nsec = 0L}, + }, + + { + .test_case_name = LOWER_BOUND_NEG_HI_1_CASE, + .msb_set = true, + .lower_bound = true, + .extra_bits = 2, + .expected = {.tv_sec = 0x180000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NEG_HI_1_CASE, + .msb_set = true, + .lower_bound = false, + .extra_bits = 2, + .expected = {.tv_sec = 0x1ffffffffLL, .tv_nsec = 0L}, + }, + + { + .test_case_name = LOWER_BOUND_NONNEG_HI_1_CASE, + .msb_set = false, + .lower_bound = true, + .extra_bits = 2, + .expected = {.tv_sec = 0x200000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NONNEG_HI_1_CASE, + .msb_set = false, + .lower_bound = false, + .extra_bits = 2, + .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NONNEG_HI_1_NS_1_CASE, + .msb_set = false, + .lower_bound = false, + .extra_bits = 6, + .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 1L}, + }, + + { + .test_case_name = LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE, + .msb_set = false, + .lower_bound = true, + .extra_bits = 0xFFFFFFFF, + .expected = {.tv_sec = 0x300000000LL, + .tv_nsec = MAX_NANOSECONDS}, + }, + + { + .test_case_name = LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE, + .msb_set = false, + .lower_bound = true, + .extra_bits = 3, + .expected = {.tv_sec = 0x300000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE, + .msb_set = false, + .lower_bound = false, + .extra_bits = 3, + .expected = {.tv_sec = 0x37fffffffLL, .tv_nsec = 0L}, + } +}; + +static void timestamp_expectation_to_desc(const struct timestamp_expectation 
*t, + char *desc) +{ + strscpy(desc, t->test_case_name, KUNIT_PARAM_DESC_SIZE); +} + +KUNIT_ARRAY_PARAM(ext4_inode, test_data, timestamp_expectation_to_desc); + +static time64_t get_32bit_time(const struct timestamp_expectation * const test) +{ + if (test->msb_set) { + if (test->lower_bound) + return LOWER_MSB_1; + + return UPPER_MSB_1; + } + + if (test->lower_bound) + return LOWER_MSB_0; + return UPPER_MSB_0; +} + + +/* + * Test data is derived from the table in the Inode Timestamps section of + * Documentation/filesystems/ext4/inodes.rst. + */ +static void inode_test_xtimestamp_decoding(struct kunit *test) +{ + struct timespec64 timestamp; + + struct timestamp_expectation *test_param = + (struct timestamp_expectation *)(test->param_value); + + timestamp = ext4_decode_extra_time( + cpu_to_le32(get_32bit_time(test_param)), + cpu_to_le32(test_param->extra_bits)); + + KUNIT_EXPECT_EQ_MSG(test, + test_param->expected.tv_sec, + timestamp.tv_sec, + CASE_NAME_FORMAT, + test_param->test_case_name, + test_param->msb_set, + test_param->lower_bound, + test_param->extra_bits); + KUNIT_EXPECT_EQ_MSG(test, + test_param->expected.tv_nsec, + timestamp.tv_nsec, + CASE_NAME_FORMAT, + test_param->test_case_name, + test_param->msb_set, + test_param->lower_bound, + test_param->extra_bits); +} + +static struct kunit_case ext4_inode_test_cases[] = { + KUNIT_CASE_PARAM(inode_test_xtimestamp_decoding, ext4_inode_gen_params), + {} +}; + +static struct kunit_suite ext4_inode_test_suite = { + .name = "ext4_inode_test", + .test_cases = ext4_inode_test_cases, +}; + +kunit_test_suites(&ext4_inode_test_suite); + +MODULE_DESCRIPTION("KUnit test of ext4 inode timestamp decoding"); +MODULE_LICENSE("GPL v2"); diff --git a/fs/ext4l/ioctl.c b/fs/ext4l/ioctl.c new file mode 100644 index 00000000000..a93a7baae99 --- /dev/null +++ b/fs/ext4l/ioctl.c @@ -0,0 +1,2020 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/ioctl.c + * + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include <linux/fs.h> +#include <linux/capability.h> +#include <linux/time.h> +#include <linux/compat.h> +#include <linux/mount.h> +#include <linux/file.h> +#include <linux/quotaops.h> +#include <linux/random.h> +#include <linux/uaccess.h> +#include <linux/delay.h> +#include <linux/iversion.h> +#include <linux/fileattr.h> +#include <linux/uuid.h> +#include "ext4_jbd2.h" +#include "ext4.h" +#include <linux/fsmap.h> +#include "fsmap.h" +#include <trace/events/ext4.h> + +typedef void ext4_update_sb_callback(struct ext4_sb_info *sbi, + struct ext4_super_block *es, + const void *arg); + +/* + * Superblock modification callback function for changing file system + * label + */ +static void ext4_sb_setlabel(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) +{ + /* Sanity check, this should never happen */ + BUILD_BUG_ON(sizeof(es->s_volume_name) < EXT4_LABEL_MAX); + + memcpy(es->s_volume_name, (char *)arg, EXT4_LABEL_MAX); +} + +/* + * Superblock modification callback function for changing file system + * UUID. 
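+ * Like ext4_sb_setlabel() above, it is invoked with the superblock + * buffer locked by the ext4_update_sb_callback machinery.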
+ */ +static void ext4_sb_setuuid(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) +{ + memcpy(es->s_uuid, (__u8 *)arg, UUID_SIZE); +} + +static +int ext4_update_primary_sb(struct super_block *sb, handle_t *handle, + ext4_update_sb_callback func, + const void *arg) +{ + int err = 0; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *bh = sbi->s_sbh; + struct ext4_super_block *es = sbi->s_es; + + trace_ext4_update_sb(sb, bh->b_blocknr, 1); + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, + bh, + EXT4_JTR_NONE); + if (err) + goto out_err; + + lock_buffer(bh); + func(sbi, es, arg); + ext4_superblock_csum_set(sb); + unlock_buffer(bh); + + if (buffer_write_io_error(bh) || !buffer_uptodate(bh)) { + ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " + "superblock detected"); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out_err; + err = sync_dirty_buffer(bh); +out_err: + ext4_std_error(sb, err); + return err; +} + +/* + * Update one backup superblock in the group 'grp' using the callback + * function 'func' and argument 'arg'. If the handle is NULL the + * modification is not journalled. + * + * Returns: 0 when no modification was done (no superblock in the group) + * 1 when the modification was successful + * <0 on error + */ +static int ext4_update_backup_sb(struct super_block *sb, + handle_t *handle, ext4_group_t grp, + ext4_update_sb_callback func, const void *arg) +{ + int err = 0; + ext4_fsblk_t sb_block; + struct buffer_head *bh; + unsigned long offset = 0; + struct ext4_super_block *es; + + if (!ext4_bg_has_super(sb, grp)) + return 0; + + /* + * For the group 0 there is always 1k padding, so we have + * either adjust offset, or sb_block depending on blocksize + */ + if (grp == 0) { + sb_block = 1 * EXT4_MIN_BLOCK_SIZE; + offset = do_div(sb_block, sb->s_blocksize); + } else { + sb_block = ext4_group_first_block_no(sb, grp); + offset = 0; + } + + trace_ext4_update_sb(sb, sb_block, handle ? 1 : 0); + + bh = ext4_sb_bread(sb, sb_block, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + + if (handle) { + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, + bh, + EXT4_JTR_NONE); + if (err) + goto out_bh; + } + + es = (struct ext4_super_block *) (bh->b_data + offset); + lock_buffer(bh); + if (ext4_has_feature_metadata_csum(sb) && + es->s_checksum != ext4_superblock_csum(es)) { + ext4_msg(sb, KERN_ERR, "Invalid checksum for backup " + "superblock %llu", sb_block); + unlock_buffer(bh); + goto out_bh; + } + func(EXT4_SB(sb), es, arg); + if (ext4_has_feature_metadata_csum(sb)) + es->s_checksum = ext4_superblock_csum(es); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + if (handle) { + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out_bh; + } else { + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + } + err = sync_dirty_buffer(bh); + +out_bh: + brelse(bh); + ext4_std_error(sb, err); + return (err) ? err : 1; +} + +/* + * Update primary and backup superblocks using the provided function + * func and argument arg. + * + * Only the primary superblock and at most two backup superblock + * modifications are journalled; the rest is modified without journal. + * This is safe because e2fsck will re-write them if there is a problem, + * and we're very unlikely to ever need more than two backups. 
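+ * With sparse_super the backups sit in groups 0 and 1 and in groups + * that are powers of 3, 5 and 7; ext4_list_backups() walks that set + * below.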
+ */ +static +int ext4_update_superblocks_fn(struct super_block *sb, + ext4_update_sb_callback func, + const void *arg) +{ + handle_t *handle; + ext4_group_t ngroups; + unsigned int three = 1; + unsigned int five = 5; + unsigned int seven = 7; + int err = 0, ret, i; + ext4_group_t grp, primary_grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* + * We can't update superblocks while the online resize is running + */ + if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, + &sbi->s_ext4_flags)) { + ext4_msg(sb, KERN_ERR, "Can't modify superblock while" + "performing online resize"); + return -EBUSY; + } + + /* + * We're only going to update primary superblock and two + * backup superblocks in this transaction. + */ + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out; + } + + /* Update primary superblock */ + err = ext4_update_primary_sb(sb, handle, func, arg); + if (err) { + ext4_msg(sb, KERN_ERR, "Failed to update primary " + "superblock"); + goto out_journal; + } + + primary_grp = ext4_get_group_number(sb, sbi->s_sbh->b_blocknr); + ngroups = ext4_get_groups_count(sb); + + /* + * Update backup superblocks. We have to start from group 0 + * because it might not be where the primary superblock is + * if the fs is mounted with -o sb=<backup_sb_block> + */ + i = 0; + grp = 0; + while (grp < ngroups) { + /* Skip primary superblock */ + if (grp == primary_grp) + goto next_grp; + + ret = ext4_update_backup_sb(sb, handle, grp, func, arg); + if (ret < 0) { + /* Ignore bad checksum; try to update next sb */ + if (ret == -EFSBADCRC) + goto next_grp; + err = ret; + goto out_journal; + } + + i += ret; + if (handle && i > 1) { + /* + * We're only journalling primary superblock and + * two backup superblocks; the rest is not + * journalled. + */ + err = ext4_journal_stop(handle); + if (err) + goto out; + handle = NULL; + } +next_grp: + grp = ext4_list_backups(sb, &three, &five, &seven); + } + +out_journal: + if (handle) { + ret = ext4_journal_stop(handle); + if (ret && !err) + err = ret; + } +out: + clear_bit_unlock(EXT4_FLAGS_RESIZING, &sbi->s_ext4_flags); + smp_mb__after_atomic(); + return err ? err : 0; +} + +/* + * Swap memory between @a and @b for @len bytes. + * + * @a: pointer to first memory area + * @b: pointer to second memory area + * @len: number of bytes to swap + * + */ +static void memswap(void *a, void *b, size_t len) +{ + unsigned char *ap, *bp; + + ap = (unsigned char *)a; + bp = (unsigned char *)b; + while (len-- > 0) { + swap(*ap, *bp); + ap++; + bp++; + } +} + +/* + * Swap i_data and associated attributes between @inode1 and @inode2. + * This function is used for the primary swap between inode1 and inode2 + * and also to revert this primary swap in case of errors. + * + * Therefore you have to make sure, that calling this method twice + * will revert all changes. 
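+ * (swap_inode_boot_loader() relies on this: its error paths call + * swap_inode_data() a second time to roll everything back.)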
+ * + * @inode1: pointer to first inode + * @inode2: pointer to second inode + */ +static void swap_inode_data(struct inode *inode1, struct inode *inode2) +{ + loff_t isize; + struct ext4_inode_info *ei1; + struct ext4_inode_info *ei2; + unsigned long tmp; + struct timespec64 ts1, ts2; + + ei1 = EXT4_I(inode1); + ei2 = EXT4_I(inode2); + + swap(inode1->i_version, inode2->i_version); + + ts1 = inode_get_atime(inode1); + ts2 = inode_get_atime(inode2); + inode_set_atime_to_ts(inode1, ts2); + inode_set_atime_to_ts(inode2, ts1); + + ts1 = inode_get_mtime(inode1); + ts2 = inode_get_mtime(inode2); + inode_set_mtime_to_ts(inode1, ts2); + inode_set_mtime_to_ts(inode2, ts1); + + memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); + tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP; + ei1->i_flags = (ei2->i_flags & EXT4_FL_SHOULD_SWAP) | + (ei1->i_flags & ~EXT4_FL_SHOULD_SWAP); + ei2->i_flags = tmp | (ei2->i_flags & ~EXT4_FL_SHOULD_SWAP); + swap(ei1->i_disksize, ei2->i_disksize); + ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); + ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); + + isize = i_size_read(inode1); + i_size_write(inode1, i_size_read(inode2)); + i_size_write(inode2, isize); +} + +void ext4_reset_inode_seed(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __le32 inum = cpu_to_le32(inode->i_ino); + __le32 gen = cpu_to_le32(inode->i_generation); + __u32 csum; + + if (!ext4_has_feature_metadata_csum(inode->i_sb)) + return; + + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); +} + +/* + * Swap the information from the given @inode and the inode + * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other + * important fields of the inodes. + * + * @sb: the super block of the filesystem + * @idmap: idmap of the mount the inode was found from + * @inode: the inode to swap with EXT4_BOOT_LOADER_INO + * + */ +static long swap_inode_boot_loader(struct super_block *sb, + struct mnt_idmap *idmap, + struct inode *inode) +{ + handle_t *handle; + int err; + struct inode *inode_bl; + struct ext4_inode_info *ei_bl; + qsize_t size, size_bl, diff; + blkcnt_t blocks; + unsigned short bytes; + + inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, + EXT4_IGET_SPECIAL | EXT4_IGET_BAD); + if (IS_ERR(inode_bl)) + return PTR_ERR(inode_bl); + ei_bl = EXT4_I(inode_bl); + + /* Protect orig inodes against a truncate and make sure, + * that only 1 swap_inode_boot_loader is running. 
*/ + lock_two_nondirectories(inode, inode_bl); + + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || + IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || + (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) || + ext4_has_inline_data(inode)) { + err = -EINVAL; + goto journal_err_out; + } + + if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || + !inode_owner_or_capable(idmap, inode) || + !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto journal_err_out; + } + + filemap_invalidate_lock(inode->i_mapping); + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto err_out; + + err = filemap_write_and_wait(inode_bl->i_mapping); + if (err) + goto err_out; + + /* Wait for all existing dio workers */ + inode_dio_wait(inode); + inode_dio_wait(inode_bl); + + truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages(&inode_bl->i_data, 0); + + handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); + if (IS_ERR(handle)) { + err = -EINVAL; + goto err_out; + } + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT, handle); + + /* Protect extent tree against block allocations via delalloc */ + ext4_double_down_write_data_sem(inode, inode_bl); + + if (is_bad_inode(inode_bl) || !S_ISREG(inode_bl->i_mode)) { + /* this inode has never been used as a BOOT_LOADER */ + set_nlink(inode_bl, 1); + i_uid_write(inode_bl, 0); + i_gid_write(inode_bl, 0); + inode_bl->i_flags = 0; + ei_bl->i_flags = 0; + inode_set_iversion(inode_bl, 1); + i_size_write(inode_bl, 0); + EXT4_I(inode_bl)->i_disksize = inode_bl->i_size; + inode_bl->i_mode = S_IFREG; + if (ext4_has_feature_extents(sb)) { + ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode_bl); + } else + memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); + } + + err = dquot_initialize(inode); + if (err) + goto err_out1; + + size = (qsize_t)(inode->i_blocks) * (1 << 9) + inode->i_bytes; + size_bl = (qsize_t)(inode_bl->i_blocks) * (1 << 9) + inode_bl->i_bytes; + diff = size - size_bl; + swap_inode_data(inode, inode_bl); + + inode_set_ctime_current(inode); + inode_set_ctime_current(inode_bl); + inode_inc_iversion(inode); + + inode->i_generation = get_random_u32(); + inode_bl->i_generation = get_random_u32(); + ext4_reset_inode_seed(inode); + ext4_reset_inode_seed(inode_bl); + + ext4_discard_preallocations(inode); + + err = ext4_mark_inode_dirty(handle, inode); + if (err < 0) { + /* No need to update quota information. */ + ext4_warning(inode->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode->i_ino, err); + /* Revert all changes: */ + swap_inode_data(inode, inode_bl); + ext4_mark_inode_dirty(handle, inode); + goto err_out1; + } + + blocks = inode_bl->i_blocks; + bytes = inode_bl->i_bytes; + inode_bl->i_blocks = inode->i_blocks; + inode_bl->i_bytes = inode->i_bytes; + err = ext4_mark_inode_dirty(handle, inode_bl); + if (err < 0) { + /* No need to update quota information. */ + ext4_warning(inode_bl->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode_bl->i_ino, err); + goto revert; + } + + /* Bootloader inode should not be counted into quota information. 
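+ * diff was computed before the swap, so a positive diff means this + * inode's new contents are smaller and that much space is released + * from its quota.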
*/ + if (diff > 0) + dquot_free_space(inode, diff); + else + err = dquot_alloc_space(inode, -1 * diff); + + if (err < 0) { +revert: + /* Revert all changes: */ + inode_bl->i_blocks = blocks; + inode_bl->i_bytes = bytes; + swap_inode_data(inode, inode_bl); + ext4_mark_inode_dirty(handle, inode); + ext4_mark_inode_dirty(handle, inode_bl); + } + +err_out1: + ext4_journal_stop(handle); + ext4_double_up_write_data_sem(inode, inode_bl); + +err_out: + filemap_invalidate_unlock(inode->i_mapping); +journal_err_out: + unlock_two_nondirectories(inode, inode_bl); + iput(inode_bl); + return err; +} + +/* + * If immutable is set and we are not clearing it, we're not allowed to change + * anything else in the inode. Don't error out if we're only trying to set + * immutable on an immutable file. + */ +static int ext4_ioctl_check_immutable(struct inode *inode, __u32 new_projid, + unsigned int flags) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int oldflags = ei->i_flags; + + if (!(oldflags & EXT4_IMMUTABLE_FL) || !(flags & EXT4_IMMUTABLE_FL)) + return 0; + + if ((oldflags & ~EXT4_IMMUTABLE_FL) != (flags & ~EXT4_IMMUTABLE_FL)) + return -EPERM; + if (ext4_has_feature_project(inode->i_sb) && + __kprojid_val(ei->i_projid) != new_projid) + return -EPERM; + + return 0; +} + +static void ext4_dax_dontcache(struct inode *inode, unsigned int flags) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + if (S_ISDIR(inode->i_mode)) + return; + + if (test_opt2(inode->i_sb, DAX_NEVER) || + test_opt(inode->i_sb, DAX_ALWAYS)) + return; + + if ((ei->i_flags ^ flags) & EXT4_DAX_FL) + d_mark_dontcache(inode); +} + +static bool dax_compatible(struct inode *inode, unsigned int oldflags, + unsigned int flags) +{ + /* Allow the DAX flag to be changed on inline directories */ + if (S_ISDIR(inode->i_mode)) { + flags &= ~EXT4_INLINE_DATA_FL; + oldflags &= ~EXT4_INLINE_DATA_FL; + } + + if (flags & EXT4_DAX_FL) { + if ((oldflags & EXT4_DAX_MUT_EXCL) || + ext4_test_inode_state(inode, + EXT4_STATE_VERITY_IN_PROGRESS)) { + return false; + } + } + + if ((flags & EXT4_DAX_MUT_EXCL) && (oldflags & EXT4_DAX_FL)) + return false; + + return true; +} + +static int ext4_ioctl_setflags(struct inode *inode, + unsigned int flags) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + handle_t *handle = NULL; + int err = -EPERM, migrate = 0; + struct ext4_iloc iloc; + unsigned int oldflags, mask, i; + struct super_block *sb = inode->i_sb; + + /* Is it quota file? Do not allow user to mess with it */ + if (ext4_is_quota_file(inode)) + goto flags_out; + + oldflags = ei->i_flags; + /* + * The JOURNAL_DATA flag can only be changed by + * the relevant capability. + */ + if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + if (!capable(CAP_SYS_RESOURCE)) + goto flags_out; + } + + if (!dax_compatible(inode, oldflags, flags)) { + err = -EOPNOTSUPP; + goto flags_out; + } + + if ((flags ^ oldflags) & EXT4_EXTENTS_FL) + migrate = 1; + + if ((flags ^ oldflags) & EXT4_CASEFOLD_FL) { + if (!ext4_has_feature_casefold(sb)) { + err = -EOPNOTSUPP; + goto flags_out; + } + + if (!S_ISDIR(inode->i_mode)) { + err = -ENOTDIR; + goto flags_out; + } + + if (!ext4_empty_dir(inode)) { + err = -ENOTEMPTY; + goto flags_out; + } + } + + /* + * Wait for all pending directio and then flush all the dirty pages + * for this file. The flush marks all the pages readonly, so any + * subsequent attempt to write to the file (particularly mmap pages) + * will come through the filesystem and fail. 
+ */ + if (S_ISREG(inode->i_mode) && !IS_IMMUTABLE(inode) && + (flags & EXT4_IMMUTABLE_FL)) { + inode_dio_wait(inode); + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto flags_out; + } + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto flags_out; + } + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto flags_err; + + ext4_dax_dontcache(inode, flags); + + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { + if (!(mask & EXT4_FL_USER_MODIFIABLE)) + continue; + /* These flags get special treatment later */ + if (mask == EXT4_JOURNAL_DATA_FL || mask == EXT4_EXTENTS_FL) + continue; + if (mask & flags) + ext4_set_inode_flag(inode, i); + else + ext4_clear_inode_flag(inode, i); + } + + ext4_set_inode_flags(inode, false); + + inode_set_ctime_current(inode); + inode_inc_iversion(inode); + + err = ext4_mark_iloc_dirty(handle, inode, &iloc); +flags_err: + ext4_journal_stop(handle); + if (err) + goto flags_out; + + if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + /* + * Changes to the journaling mode can cause unsafe changes to + * S_DAX if the inode is DAX + */ + if (IS_DAX(inode)) { + err = -EBUSY; + goto flags_out; + } + + err = ext4_change_inode_journal_flag(inode, + flags & EXT4_JOURNAL_DATA_FL); + if (err) + goto flags_out; + } + if (migrate) { + if (flags & EXT4_EXTENTS_FL) + err = ext4_ext_migrate(inode); + else + err = ext4_ind_migrate(inode); + } + +flags_out: + return err; +} + +#ifdef CONFIG_QUOTA +static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) +{ + struct super_block *sb = inode->i_sb; + struct ext4_inode_info *ei = EXT4_I(inode); + int err, rc; + handle_t *handle; + kprojid_t kprojid; + struct ext4_iloc iloc; + struct ext4_inode *raw_inode; + struct dquot *transfer_to[MAXQUOTAS] = { }; + + if (!ext4_has_feature_project(sb)) { + if (projid != EXT4_DEF_PROJID) + return -EOPNOTSUPP; + else + return 0; + } + + if (EXT4_INODE_SIZE(sb) <= EXT4_GOOD_OLD_INODE_SIZE) + return -EOPNOTSUPP; + + kprojid = make_kprojid(&init_user_ns, (projid_t)projid); + + if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) + return 0; + + err = -EPERM; + /* Is it quota file? Do not allow user to mess with it */ + if (ext4_is_quota_file(inode)) + return err; + + err = dquot_initialize(inode); + if (err) + return err; + + err = ext4_get_inode_loc(inode, &iloc); + if (err) + return err; + + raw_inode = ext4_raw_inode(&iloc); + if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { + err = ext4_expand_extra_isize(inode, + EXT4_SB(sb)->s_want_extra_isize, + &iloc); + if (err) + return err; + } else { + brelse(iloc.bh); + } + + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, + EXT4_QUOTA_INIT_BLOCKS(sb) + + EXT4_QUOTA_DEL_BLOCKS(sb) + 3); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_stop; + + transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); + if (!IS_ERR(transfer_to[PRJQUOTA])) { + + /* __dquot_transfer() calls back ext4_get_inode_usage() which + * counts xattr inode references. 
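+ * Holding xattr_sem in read mode keeps that reference count stable + * for the duration of the transfer.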
+ */ + down_read(&EXT4_I(inode)->xattr_sem); + err = __dquot_transfer(inode, transfer_to); + up_read(&EXT4_I(inode)->xattr_sem); + dqput(transfer_to[PRJQUOTA]); + if (err) + goto out_dirty; + } + + EXT4_I(inode)->i_projid = kprojid; + inode_set_ctime_current(inode); + inode_inc_iversion(inode); +out_dirty: + rc = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; +out_stop: + ext4_journal_stop(handle); + return err; +} +#else +static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) +{ + if (projid != EXT4_DEF_PROJID) + return -EOPNOTSUPP; + return 0; +} +#endif + +int ext4_force_shutdown(struct super_block *sb, u32 flags) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int ret; + + if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH) + return -EINVAL; + + if (ext4_forced_shutdown(sb)) + return 0; + + ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags); + trace_ext4_shutdown(sb, flags); + + switch (flags) { + case EXT4_GOING_FLAGS_DEFAULT: + ret = bdev_freeze(sb->s_bdev); + if (ret) + return ret; + set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); + bdev_thaw(sb->s_bdev); + break; + case EXT4_GOING_FLAGS_LOGFLUSH: + set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); + if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) { + (void) ext4_force_commit(sb); + jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN); + } + break; + case EXT4_GOING_FLAGS_NOLOGFLUSH: + set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); + if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) + jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN); + break; + default: + return -EINVAL; + } + clear_opt(sb, DISCARD); + return 0; +} + +static int ext4_ioctl_shutdown(struct super_block *sb, unsigned long arg) +{ + u32 flags; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(flags, (__u32 __user *)arg)) + return -EFAULT; + + return ext4_force_shutdown(sb, flags); +} + +struct getfsmap_info { + struct super_block *gi_sb; + struct fsmap_head __user *gi_data; + unsigned int gi_idx; + __u32 gi_last_flags; +}; + +static int ext4_getfsmap_format(struct ext4_fsmap *xfm, void *priv) +{ + struct getfsmap_info *info = priv; + struct fsmap fm; + + trace_ext4_getfsmap_mapping(info->gi_sb, xfm); + + info->gi_last_flags = xfm->fmr_flags; + ext4_fsmap_from_internal(info->gi_sb, &fm, xfm); + if (copy_to_user(&info->gi_data->fmh_recs[info->gi_idx++], &fm, + sizeof(struct fsmap))) + return -EFAULT; + + return 0; +} + +static int ext4_ioc_getfsmap(struct super_block *sb, + struct fsmap_head __user *arg) +{ + struct getfsmap_info info = { NULL }; + struct ext4_fsmap_head xhead = {0}; + struct fsmap_head head; + bool aborted = false; + int error; + + if (copy_from_user(&head, arg, sizeof(struct fsmap_head))) + return -EFAULT; + if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) || + memchr_inv(head.fmh_keys[0].fmr_reserved, 0, + sizeof(head.fmh_keys[0].fmr_reserved)) || + memchr_inv(head.fmh_keys[1].fmr_reserved, 0, + sizeof(head.fmh_keys[1].fmr_reserved))) + return -EINVAL; + /* + * ext4 doesn't report file extents at all, so the only valid + * file offsets are the magic ones (all zeroes or all ones). 
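+ * That is: 0 in the low key, and 0 or -1 in the high key, exactly + * what the check below accepts.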
+ */ + if (head.fmh_keys[0].fmr_offset || + (head.fmh_keys[1].fmr_offset != 0 && + head.fmh_keys[1].fmr_offset != -1ULL)) + return -EINVAL; + + xhead.fmh_iflags = head.fmh_iflags; + xhead.fmh_count = head.fmh_count; + ext4_fsmap_to_internal(sb, &xhead.fmh_keys[0], &head.fmh_keys[0]); + ext4_fsmap_to_internal(sb, &xhead.fmh_keys[1], &head.fmh_keys[1]); + + trace_ext4_getfsmap_low_key(sb, &xhead.fmh_keys[0]); + trace_ext4_getfsmap_high_key(sb, &xhead.fmh_keys[1]); + + info.gi_sb = sb; + info.gi_data = arg; + error = ext4_getfsmap(sb, &xhead, ext4_getfsmap_format, &info); + if (error == EXT4_QUERY_RANGE_ABORT) + aborted = true; + else if (error) + return error; + + /* If we didn't abort, set the "last" flag in the last fmx */ + if (!aborted && info.gi_idx) { + info.gi_last_flags |= FMR_OF_LAST; + if (copy_to_user(&info.gi_data->fmh_recs[info.gi_idx - 1].fmr_flags, + &info.gi_last_flags, + sizeof(info.gi_last_flags))) + return -EFAULT; + } + + /* copy back header */ + head.fmh_entries = xhead.fmh_entries; + head.fmh_oflags = xhead.fmh_oflags; + if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) + return -EFAULT; + + return 0; +} + +static long ext4_ioctl_group_add(struct file *file, + struct ext4_new_group_data *input) +{ + struct super_block *sb = file_inode(file)->i_sb; + int err, err2=0; + + err = ext4_resize_begin(sb); + if (err) + return err; + + if (ext4_has_feature_bigalloc(sb)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + err = -EOPNOTSUPP; + goto group_add_out; + } + + err = mnt_want_write_file(file); + if (err) + goto group_add_out; + + err = ext4_group_add(sb, input); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write_file(file); + if (!err && ext4_has_group_desc_csum(sb) && + test_opt(sb, INIT_INODE_TABLE)) + err = ext4_register_li_request(sb, input->group); +group_add_out: + err2 = ext4_resize_end(sb, false); + if (err == 0) + err = err2; + return err; +} + +int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct ext4_inode_info *ei = EXT4_I(inode); + u32 flags = ei->i_flags & EXT4_FL_USER_VISIBLE; + + if (S_ISREG(inode->i_mode)) + flags &= ~FS_PROJINHERIT_FL; + + fileattr_fill_flags(fa, flags); + if (ext4_has_feature_project(inode->i_sb)) + fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid); + + return 0; +} + +int ext4_fileattr_set(struct mnt_idmap *idmap, + struct dentry *dentry, struct file_kattr *fa) +{ + struct inode *inode = d_inode(dentry); + u32 flags = fa->flags; + int err = -EOPNOTSUPP; + + if (flags & ~EXT4_FL_USER_VISIBLE) + goto out; + + /* + * chattr(1) grabs flags via GETFLAGS, modifies the result and + * passes that to SETFLAGS. So we cannot easily make SETFLAGS + * more restrictive than just silently masking off visible but + * not settable flags as we always did. + */ + flags &= EXT4_FL_USER_MODIFIABLE; + if (ext4_mask_flags(inode->i_mode, flags) != flags) + goto out; + err = ext4_ioctl_check_immutable(inode, fa->fsx_projid, flags); + if (err) + goto out; + err = ext4_ioctl_setflags(inode, flags); + if (err) + goto out; + err = ext4_ioctl_setproject(inode, fa->fsx_projid); +out: + return err; +} + +/* So that the fiemap access checks can't overflow on 32 bit machines. 
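+ * With this cap, fm_extent_count * sizeof(struct fiemap_extent) + * always fits in an unsigned int.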
*/ +#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) + +static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) +{ + struct fiemap fiemap; + struct fiemap __user *ufiemap = (struct fiemap __user *) arg; + struct fiemap_extent_info fieinfo = { 0, }; + struct inode *inode = file_inode(filp); + int error; + + if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) + return -EFAULT; + + if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) + return -EINVAL; + + fieinfo.fi_flags = fiemap.fm_flags; + fieinfo.fi_extents_max = fiemap.fm_extent_count; + fieinfo.fi_extents_start = ufiemap->fm_extents; + + error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, + fiemap.fm_length); + fiemap.fm_flags = fieinfo.fi_flags; + fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; + if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) + error = -EFAULT; + + return error; +} + +static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg) +{ + int err = 0; + __u32 flags = 0; + unsigned int flush_flags = 0; + struct super_block *sb = file_inode(filp)->i_sb; + + if (copy_from_user(&flags, (__u32 __user *)arg, + sizeof(__u32))) + return -EFAULT; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* check for invalid bits set */ + if ((flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID) || + ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && + (flags & JBD2_JOURNAL_FLUSH_ZEROOUT))) + return -EINVAL; + + if (!EXT4_SB(sb)->s_journal) + return -ENODEV; + + if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && + !bdev_max_discard_sectors(EXT4_SB(sb)->s_journal->j_dev)) + return -EOPNOTSUPP; + + if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN) + return 0; + + if (flags & EXT4_IOC_CHECKPOINT_FLAG_DISCARD) + flush_flags |= JBD2_JOURNAL_FLUSH_DISCARD; + + if (flags & EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT) { + flush_flags |= JBD2_JOURNAL_FLUSH_ZEROOUT; + pr_info_ratelimited("warning: checkpointing journal with EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT can be slow"); + } + + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, flush_flags); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + + return err; +} + +static int ext4_ioctl_setlabel(struct file *filp, const char __user *user_label) +{ + size_t len; + int ret = 0; + char new_label[EXT4_LABEL_MAX + 1]; + struct super_block *sb = file_inode(filp)->i_sb; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * Copy the maximum length allowed for ext4 label with one more to + * find the required terminating null byte in order to test the + * label length. The on disk label doesn't need to be null terminated. + */ + if (copy_from_user(new_label, user_label, EXT4_LABEL_MAX + 1)) + return -EFAULT; + + len = strnlen(new_label, EXT4_LABEL_MAX + 1); + if (len > EXT4_LABEL_MAX) + return -EINVAL; + + /* + * Clear the buffer after the new label + */ + memset(new_label + len, 0, EXT4_LABEL_MAX - len); + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = ext4_update_superblocks_fn(sb, ext4_sb_setlabel, new_label); + + mnt_drop_write_file(filp); + return ret; +} + +static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label) +{ + char label[EXT4_LABEL_MAX + 1]; + + /* + * EXT4_LABEL_MAX must always be smaller than FSLABEL_MAX because + * FSLABEL_MAX must include terminating null byte, while s_volume_name + * does not have to. 
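+ * The BUILD_BUG_ON below enforces that relationship at compile time.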
+ */ + BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX); + + lock_buffer(sbi->s_sbh); + memtostr_pad(label, sbi->s_es->s_volume_name); + unlock_buffer(sbi->s_sbh); + + if (copy_to_user(user_label, label, sizeof(label))) + return -EFAULT; + return 0; +} + +static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi, + struct fsuuid __user *ufsuuid) +{ + struct fsuuid fsuuid; + __u8 uuid[UUID_SIZE]; + + if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid))) + return -EFAULT; + + if (fsuuid.fsu_len == 0) { + fsuuid.fsu_len = UUID_SIZE; + if (copy_to_user(&ufsuuid->fsu_len, &fsuuid.fsu_len, + sizeof(fsuuid.fsu_len))) + return -EFAULT; + return 0; + } + + if (fsuuid.fsu_len < UUID_SIZE || fsuuid.fsu_flags != 0) + return -EINVAL; + + lock_buffer(sbi->s_sbh); + memcpy(uuid, sbi->s_es->s_uuid, UUID_SIZE); + unlock_buffer(sbi->s_sbh); + + fsuuid.fsu_len = UUID_SIZE; + if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid)) || + copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE)) + return -EFAULT; + return 0; +} + +static int ext4_ioctl_setuuid(struct file *filp, + const struct fsuuid __user *ufsuuid) +{ + int ret = 0; + struct super_block *sb = file_inode(filp)->i_sb; + struct fsuuid fsuuid; + __u8 uuid[UUID_SIZE]; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * If any checksums (group descriptors or metadata) are being used + * then the checksum seed feature is required to change the UUID. + */ + if (((ext4_has_feature_gdt_csum(sb) || + ext4_has_feature_metadata_csum(sb)) + && !ext4_has_feature_csum_seed(sb)) + || ext4_has_feature_stable_inodes(sb)) + return -EOPNOTSUPP; + + if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid))) + return -EFAULT; + + if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0) + return -EINVAL; + + if (copy_from_user(uuid, &ufsuuid->fsu_uuid[0], UUID_SIZE)) + return -EFAULT; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = ext4_update_superblocks_fn(sb, ext4_sb_setuuid, &uuid); + mnt_drop_write_file(filp); + + return ret; +} + + +#define TUNE_OPS_SUPPORTED (EXT4_TUNE_FL_ERRORS_BEHAVIOR | \ + EXT4_TUNE_FL_MNT_COUNT | EXT4_TUNE_FL_MAX_MNT_COUNT | \ + EXT4_TUNE_FL_CHECKINTRVAL | EXT4_TUNE_FL_LAST_CHECK_TIME | \ + EXT4_TUNE_FL_RESERVED_BLOCKS | EXT4_TUNE_FL_RESERVED_UID | \ + EXT4_TUNE_FL_RESERVED_GID | EXT4_TUNE_FL_DEFAULT_MNT_OPTS | \ + EXT4_TUNE_FL_DEF_HASH_ALG | EXT4_TUNE_FL_RAID_STRIDE | \ + EXT4_TUNE_FL_RAID_STRIPE_WIDTH | EXT4_TUNE_FL_MOUNT_OPTS | \ + EXT4_TUNE_FL_FEATURES | EXT4_TUNE_FL_EDIT_FEATURES | \ + EXT4_TUNE_FL_FORCE_FSCK | EXT4_TUNE_FL_ENCODING | \ + EXT4_TUNE_FL_ENCODING_FLAGS) + +#define EXT4_TUNE_SET_COMPAT_SUPP \ + (EXT4_FEATURE_COMPAT_DIR_INDEX | \ + EXT4_FEATURE_COMPAT_STABLE_INODES) +#define EXT4_TUNE_SET_INCOMPAT_SUPP \ + (EXT4_FEATURE_INCOMPAT_EXTENTS | \ + EXT4_FEATURE_INCOMPAT_EA_INODE | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ + EXT4_FEATURE_INCOMPAT_LARGEDIR | \ + EXT4_FEATURE_INCOMPAT_CASEFOLD) +#define EXT4_TUNE_SET_RO_COMPAT_SUPP \ + (EXT4_FEATURE_RO_COMPAT_LARGE_FILE | \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_PROJECT | \ + EXT4_FEATURE_RO_COMPAT_VERITY) + +#define EXT4_TUNE_CLEAR_COMPAT_SUPP (0) +#define EXT4_TUNE_CLEAR_INCOMPAT_SUPP (0) +#define EXT4_TUNE_CLEAR_RO_COMPAT_SUPP (0) + +#define SB_ENC_SUPP_MASK (SB_ENC_STRICT_MODE_FL | \ + SB_ENC_NO_COMPAT_FALLBACK_FL) + +static int ext4_ioctl_get_tune_sb(struct ext4_sb_info *sbi, + struct ext4_tune_sb_params __user *params) +{ + struct ext4_tune_sb_params ret; + struct 
ext4_super_block *es = sbi->s_es; + + memset(&ret, 0, sizeof(ret)); + ret.set_flags = TUNE_OPS_SUPPORTED; + ret.errors_behavior = le16_to_cpu(es->s_errors); + ret.mnt_count = le16_to_cpu(es->s_mnt_count); + ret.max_mnt_count = le16_to_cpu(es->s_max_mnt_count); + ret.checkinterval = le32_to_cpu(es->s_checkinterval); + ret.last_check_time = le32_to_cpu(es->s_lastcheck); + ret.reserved_blocks = ext4_r_blocks_count(es); + ret.blocks_count = ext4_blocks_count(es); + ret.reserved_uid = ext4_get_resuid(es); + ret.reserved_gid = ext4_get_resgid(es); + ret.default_mnt_opts = le32_to_cpu(es->s_default_mount_opts); + ret.def_hash_alg = es->s_def_hash_version; + ret.raid_stride = le16_to_cpu(es->s_raid_stride); + ret.raid_stripe_width = le32_to_cpu(es->s_raid_stripe_width); + ret.encoding = le16_to_cpu(es->s_encoding); + ret.encoding_flags = le16_to_cpu(es->s_encoding_flags); + strscpy_pad(ret.mount_opts, es->s_mount_opts); + ret.feature_compat = le32_to_cpu(es->s_feature_compat); + ret.feature_incompat = le32_to_cpu(es->s_feature_incompat); + ret.feature_ro_compat = le32_to_cpu(es->s_feature_ro_compat); + ret.set_feature_compat_mask = EXT4_TUNE_SET_COMPAT_SUPP; + ret.set_feature_incompat_mask = EXT4_TUNE_SET_INCOMPAT_SUPP; + ret.set_feature_ro_compat_mask = EXT4_TUNE_SET_RO_COMPAT_SUPP; + ret.clear_feature_compat_mask = EXT4_TUNE_CLEAR_COMPAT_SUPP; + ret.clear_feature_incompat_mask = EXT4_TUNE_CLEAR_INCOMPAT_SUPP; + ret.clear_feature_ro_compat_mask = EXT4_TUNE_CLEAR_RO_COMPAT_SUPP; + if (copy_to_user(params, &ret, sizeof(ret))) + return -EFAULT; + return 0; +} + +static void ext4_sb_setparams(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) +{ + const struct ext4_tune_sb_params *params = arg; + + if (params->set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) + es->s_errors = cpu_to_le16(params->errors_behavior); + if (params->set_flags & EXT4_TUNE_FL_MNT_COUNT) + es->s_mnt_count = cpu_to_le16(params->mnt_count); + if (params->set_flags & EXT4_TUNE_FL_MAX_MNT_COUNT) + es->s_max_mnt_count = cpu_to_le16(params->max_mnt_count); + if (params->set_flags & EXT4_TUNE_FL_CHECKINTRVAL) + es->s_checkinterval = cpu_to_le32(params->checkinterval); + if (params->set_flags & EXT4_TUNE_FL_LAST_CHECK_TIME) + es->s_lastcheck = cpu_to_le32(params->last_check_time); + if (params->set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) { + ext4_fsblk_t blk = params->reserved_blocks; + + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); + } + if (params->set_flags & EXT4_TUNE_FL_RESERVED_UID) { + int uid = params->reserved_uid; + + es->s_def_resuid = cpu_to_le16(uid & 0xFFFF); + es->s_def_resuid_hi = cpu_to_le16(uid >> 16); + } + if (params->set_flags & EXT4_TUNE_FL_RESERVED_GID) { + int gid = params->reserved_gid; + + es->s_def_resgid = cpu_to_le16(gid & 0xFFFF); + es->s_def_resgid_hi = cpu_to_le16(gid >> 16); + } + if (params->set_flags & EXT4_TUNE_FL_DEFAULT_MNT_OPTS) + es->s_default_mount_opts = cpu_to_le32(params->default_mnt_opts); + if (params->set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + es->s_def_hash_version = params->def_hash_alg; + if (params->set_flags & EXT4_TUNE_FL_RAID_STRIDE) + es->s_raid_stride = cpu_to_le16(params->raid_stride); + if (params->set_flags & EXT4_TUNE_FL_RAID_STRIPE_WIDTH) + es->s_raid_stripe_width = + cpu_to_le32(params->raid_stripe_width); + if (params->set_flags & EXT4_TUNE_FL_ENCODING) + es->s_encoding = cpu_to_le16(params->encoding); + if (params->set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) + es->s_encoding_flags = 
cpu_to_le16(params->encoding_flags); + strscpy_pad(es->s_mount_opts, params->mount_opts); + if (params->set_flags & EXT4_TUNE_FL_EDIT_FEATURES) { + es->s_feature_compat |= + cpu_to_le32(params->set_feature_compat_mask); + es->s_feature_incompat |= + cpu_to_le32(params->set_feature_incompat_mask); + es->s_feature_ro_compat |= + cpu_to_le32(params->set_feature_ro_compat_mask); + es->s_feature_compat &= + ~cpu_to_le32(params->clear_feature_compat_mask); + es->s_feature_incompat &= + ~cpu_to_le32(params->clear_feature_incompat_mask); + es->s_feature_ro_compat &= + ~cpu_to_le32(params->clear_feature_ro_compat_mask); + if (params->set_feature_compat_mask & + EXT4_FEATURE_COMPAT_DIR_INDEX) + es->s_def_hash_version = sbi->s_def_hash_version; + if (params->set_feature_incompat_mask & + EXT4_FEATURE_INCOMPAT_CSUM_SEED) + es->s_checksum_seed = cpu_to_le32(sbi->s_csum_seed); + } + if (params->set_flags & EXT4_TUNE_FL_FORCE_FSCK) + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); +} + +static int ext4_ioctl_set_tune_sb(struct file *filp, + struct ext4_tune_sb_params __user *in) +{ + struct ext4_tune_sb_params params; + struct super_block *sb = file_inode(filp)->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int enabling_casefold = 0; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&params, in, sizeof(params))) + return -EFAULT; + + if ((params.set_flags & ~TUNE_OPS_SUPPORTED) != 0) + return -EOPNOTSUPP; + + if ((params.set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) && + (params.errors_behavior > EXT4_ERRORS_PANIC)) + return -EINVAL; + + if ((params.set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) && + (params.reserved_blocks > ext4_blocks_count(sbi->s_es) / 2)) + return -EINVAL; + if ((params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) && + ((params.def_hash_alg > DX_HASH_LAST) || + (params.def_hash_alg == DX_HASH_SIPHASH))) + return -EINVAL; + if ((params.set_flags & EXT4_TUNE_FL_FEATURES) && + (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES)) + return -EINVAL; + + if (params.set_flags & EXT4_TUNE_FL_FEATURES) { + params.set_feature_compat_mask = + params.feature_compat & + ~le32_to_cpu(es->s_feature_compat); + params.set_feature_incompat_mask = + params.feature_incompat & + ~le32_to_cpu(es->s_feature_incompat); + params.set_feature_ro_compat_mask = + params.feature_ro_compat & + ~le32_to_cpu(es->s_feature_ro_compat); + params.clear_feature_compat_mask = + ~params.feature_compat & + le32_to_cpu(es->s_feature_compat); + params.clear_feature_incompat_mask = + ~params.feature_incompat & + le32_to_cpu(es->s_feature_incompat); + params.clear_feature_ro_compat_mask = + ~params.feature_ro_compat & + le32_to_cpu(es->s_feature_ro_compat); + params.set_flags |= EXT4_TUNE_FL_EDIT_FEATURES; + } + if (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES) { + if ((params.set_feature_compat_mask & + ~EXT4_TUNE_SET_COMPAT_SUPP) || + (params.set_feature_incompat_mask & + ~EXT4_TUNE_SET_INCOMPAT_SUPP) || + (params.set_feature_ro_compat_mask & + ~EXT4_TUNE_SET_RO_COMPAT_SUPP) || + (params.clear_feature_compat_mask & + ~EXT4_TUNE_CLEAR_COMPAT_SUPP) || + (params.clear_feature_incompat_mask & + ~EXT4_TUNE_CLEAR_INCOMPAT_SUPP) || + (params.clear_feature_ro_compat_mask & + ~EXT4_TUNE_CLEAR_RO_COMPAT_SUPP)) + return -EOPNOTSUPP; + + /* + * Filter out the features that are already set from + * the set_mask. 
+ */ + params.set_feature_compat_mask &= + ~le32_to_cpu(es->s_feature_compat); + params.set_feature_incompat_mask &= + ~le32_to_cpu(es->s_feature_incompat); + params.set_feature_ro_compat_mask &= + ~le32_to_cpu(es->s_feature_ro_compat); + if ((params.set_feature_incompat_mask & + EXT4_FEATURE_INCOMPAT_CASEFOLD)) { + enabling_casefold = 1; + if (!(params.set_flags & EXT4_TUNE_FL_ENCODING)) { + params.encoding = EXT4_ENC_UTF8_12_1; + params.set_flags |= EXT4_TUNE_FL_ENCODING; + } + if (!(params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS)) { + params.encoding_flags = 0; + params.set_flags |= EXT4_TUNE_FL_ENCODING_FLAGS; + } + } + if ((params.set_feature_compat_mask & + EXT4_FEATURE_COMPAT_DIR_INDEX)) { + uuid_t uu; + + memcpy(&uu, sbi->s_hash_seed, UUID_SIZE); + if (uuid_is_null(&uu)) + generate_random_uuid((char *) + &sbi->s_hash_seed); + if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + sbi->s_def_hash_version = params.def_hash_alg; + else if (sbi->s_def_hash_version == 0) + sbi->s_def_hash_version = DX_HASH_HALF_MD4; + if (!(es->s_flags & + cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH)) && + !(es->s_flags & + cpu_to_le32(EXT2_FLAGS_SIGNED_HASH))) { +#ifdef __CHAR_UNSIGNED__ + sbi->s_hash_unsigned = 3; +#else + sbi->s_hash_unsigned = 0; +#endif + } + } + } + if (params.set_flags & EXT4_TUNE_FL_ENCODING) { + if (!enabling_casefold) + return -EINVAL; + if (params.encoding == 0) + params.encoding = EXT4_ENC_UTF8_12_1; + else if (params.encoding != EXT4_ENC_UTF8_12_1) + return -EINVAL; + } + if (params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) { + if (!enabling_casefold) + return -EINVAL; + if (params.encoding_flags & ~SB_ENC_SUPP_MASK) + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = ext4_update_superblocks_fn(sb, ext4_sb_setparams, &params); + mnt_drop_write_file(filp); + + if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + sbi->s_def_hash_version = params.def_hash_alg; + + return ret; +} + +static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; + struct mnt_idmap *idmap = file_mnt_idmap(filp); + + ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { + case FS_IOC_GETFSMAP: + return ext4_ioc_getfsmap(sb, (void __user *)arg); + case EXT4_IOC_GETVERSION: + case EXT4_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); + case EXT4_IOC_SETVERSION: + case EXT4_IOC_SETVERSION_OLD: { + handle_t *handle; + struct ext4_iloc iloc; + __u32 generation; + int err; + + if (!inode_owner_or_capable(idmap, inode)) + return -EPERM; + + if (ext4_has_feature_metadata_csum(inode->i_sb)) { + ext4_warning(sb, "Setting inode version is not " + "supported with metadata_csum enabled."); + return -ENOTTY; + } + + err = mnt_want_write_file(filp); + if (err) + return err; + if (get_user(generation, (int __user *) arg)) { + err = -EFAULT; + goto setversion_out; + } + + inode_lock(inode); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto unlock_out; + } + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err == 0) { + inode_set_ctime_current(inode); + inode_inc_iversion(inode); + inode->i_generation = generation; + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + } + ext4_journal_stop(handle); + +unlock_out: + inode_unlock(inode); +setversion_out: + mnt_drop_write_file(filp); + return err; + } + case EXT4_IOC_GROUP_EXTEND: { + ext4_fsblk_t n_blocks_count; + int err, 
err2=0; + + err = ext4_resize_begin(sb); + if (err) + return err; + + if (get_user(n_blocks_count, (__u32 __user *)arg)) { + err = -EFAULT; + goto group_extend_out; + } + + if (ext4_has_feature_bigalloc(sb)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + err = -EOPNOTSUPP; + goto group_extend_out; + } + + err = mnt_want_write_file(filp); + if (err) + goto group_extend_out; + + err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write_file(filp); +group_extend_out: + err2 = ext4_resize_end(sb, false); + if (err == 0) + err = err2; + return err; + } + + case EXT4_IOC_MOVE_EXT: { + struct move_extent me; + int err; + + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&me, + (struct move_extent __user *)arg, sizeof(me))) + return -EFAULT; + me.moved_len = 0; + + CLASS(fd, donor)(me.donor_fd); + if (fd_empty(donor)) + return -EBADF; + + if (!(fd_file(donor)->f_mode & FMODE_WRITE)) + return -EBADF; + + if (ext4_has_feature_bigalloc(sb)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported with bigalloc"); + return -EOPNOTSUPP; + } else if (IS_DAX(inode)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported with DAX"); + return -EOPNOTSUPP; + } + + err = mnt_want_write_file(filp); + if (err) + return err; + + err = ext4_move_extents(filp, fd_file(donor), me.orig_start, + me.donor_start, me.len, &me.moved_len); + mnt_drop_write_file(filp); + + if (copy_to_user((struct move_extent __user *)arg, + &me, sizeof(me))) + err = -EFAULT; + return err; + } + + case EXT4_IOC_GROUP_ADD: { + struct ext4_new_group_data input; + + if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, + sizeof(input))) + return -EFAULT; + + return ext4_ioctl_group_add(filp, &input); + } + + case EXT4_IOC_MIGRATE: + { + int err; + if (!inode_owner_or_capable(idmap, inode)) + return -EACCES; + + err = mnt_want_write_file(filp); + if (err) + return err; + /* + * inode_mutex prevents write and truncate on the file. + * Read still goes through. We take i_data_sem in + * ext4_ext_swap_inode_data before we switch the + * inode format to prevent read. 
+ */ + inode_lock((inode)); + err = ext4_ext_migrate(inode); + inode_unlock((inode)); + mnt_drop_write_file(filp); + return err; + } + + case EXT4_IOC_ALLOC_DA_BLKS: + { + int err; + if (!inode_owner_or_capable(idmap, inode)) + return -EACCES; + + err = mnt_want_write_file(filp); + if (err) + return err; + err = ext4_alloc_da_blocks(inode); + mnt_drop_write_file(filp); + return err; + } + + case EXT4_IOC_SWAP_BOOT: + { + int err; + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + err = mnt_want_write_file(filp); + if (err) + return err; + err = swap_inode_boot_loader(sb, idmap, inode); + mnt_drop_write_file(filp); + return err; + } + + case EXT4_IOC_RESIZE_FS: { + ext4_fsblk_t n_blocks_count; + int err = 0, err2 = 0; + ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; + + if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, + sizeof(__u64))) { + return -EFAULT; + } + + err = ext4_resize_begin(sb); + if (err) + return err; + + err = mnt_want_write_file(filp); + if (err) + goto resizefs_out; + + err = ext4_resize_fs(sb, n_blocks_count); + if (EXT4_SB(sb)->s_journal) { + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL); + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write_file(filp); + if (!err && (o_group < EXT4_SB(sb)->s_groups_count) && + ext4_has_group_desc_csum(sb) && + test_opt(sb, INIT_INODE_TABLE)) + err = ext4_register_li_request(sb, o_group); + +resizefs_out: + err2 = ext4_resize_end(sb, true); + if (err == 0) + err = err2; + return err; + } + + case FITRIM: + { + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!bdev_max_discard_sectors(sb->s_bdev)) + return -EOPNOTSUPP; + + /* + * We haven't replayed the journal, so we cannot use our + * block-bitmap-guided storage zapping commands. 
+ */ + if (test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) + return -EROFS; + + if (copy_from_user(&range, (struct fstrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + ret = ext4_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range __user *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } + case EXT4_IOC_PRECACHE_EXTENTS: + { + int ret; + + inode_lock_shared(inode); + ret = ext4_ext_precache(inode); + inode_unlock_shared(inode); + return ret; + } + case FS_IOC_SET_ENCRYPTION_POLICY: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); + + case FS_IOC_GET_ENCRYPTION_PWSALT: + return ext4_ioctl_get_encryption_pwsalt(filp, (void __user *)arg); + + case FS_IOC_GET_ENCRYPTION_POLICY: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); + + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_get_policy_ex(filp, (void __user *)arg); + + case FS_IOC_ADD_ENCRYPTION_KEY: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_add_key(filp, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_remove_key(filp, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_remove_key_all_users(filp, + (void __user *)arg); + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_get_key_status(filp, (void __user *)arg); + + case FS_IOC_GET_ENCRYPTION_NONCE: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_get_nonce(filp, (void __user *)arg); + + case EXT4_IOC_CLEAR_ES_CACHE: + { + if (!inode_owner_or_capable(idmap, inode)) + return -EACCES; + ext4_clear_inode_es(inode); + return 0; + } + + case EXT4_IOC_GETSTATE: + { + __u32 state = 0; + + if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED)) + state |= EXT4_STATE_FLAG_EXT_PRECACHED; + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) + state |= EXT4_STATE_FLAG_NEW; + if (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) + state |= EXT4_STATE_FLAG_NEWENTRY; + if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) + state |= EXT4_STATE_FLAG_DA_ALLOC_CLOSE; + + return put_user(state, (__u32 __user *) arg); + } + + case EXT4_IOC_GET_ES_CACHE: + return ext4_ioctl_get_es_cache(filp, arg); + + case EXT4_IOC_SHUTDOWN: + return ext4_ioctl_shutdown(sb, arg); + + case FS_IOC_ENABLE_VERITY: + if (!ext4_has_feature_verity(sb)) + return -EOPNOTSUPP; + return fsverity_ioctl_enable(filp, (const void __user *)arg); + + case FS_IOC_MEASURE_VERITY: + if (!ext4_has_feature_verity(sb)) + return -EOPNOTSUPP; + return fsverity_ioctl_measure(filp, (void __user *)arg); + + case FS_IOC_READ_VERITY_METADATA: + if (!ext4_has_feature_verity(sb)) + return -EOPNOTSUPP; + return fsverity_ioctl_read_metadata(filp, + (const void __user *)arg); + + case EXT4_IOC_CHECKPOINT: + return ext4_ioctl_checkpoint(filp, arg); + + case FS_IOC_GETFSLABEL: + return ext4_ioctl_getlabel(EXT4_SB(sb), (void __user *)arg); + + case FS_IOC_SETFSLABEL: + return ext4_ioctl_setlabel(filp, + (const void __user *)arg); + + case EXT4_IOC_GETFSUUID: + return ext4_ioctl_getuuid(EXT4_SB(sb), (void __user *)arg); + case 
EXT4_IOC_SETFSUUID: + return ext4_ioctl_setuuid(filp, (const void __user *)arg); + case EXT4_IOC_GET_TUNE_SB_PARAM: + return ext4_ioctl_get_tune_sb(EXT4_SB(sb), + (void __user *)arg); + case EXT4_IOC_SET_TUNE_SB_PARAM: + return ext4_ioctl_set_tune_sb(filp, (void __user *)arg); + default: + return -ENOTTY; + } +} + +long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + return __ext4_ioctl(filp, cmd, arg); +} + +#ifdef CONFIG_COMPAT +long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + /* These are just misnamed, they actually get/put from/to user an int */ + switch (cmd) { + case EXT4_IOC32_GETVERSION: + cmd = EXT4_IOC_GETVERSION; + break; + case EXT4_IOC32_SETVERSION: + cmd = EXT4_IOC_SETVERSION; + break; + case EXT4_IOC32_GROUP_EXTEND: + cmd = EXT4_IOC_GROUP_EXTEND; + break; + case EXT4_IOC32_GETVERSION_OLD: + cmd = EXT4_IOC_GETVERSION_OLD; + break; + case EXT4_IOC32_SETVERSION_OLD: + cmd = EXT4_IOC_SETVERSION_OLD; + break; + case EXT4_IOC32_GETRSVSZ: + cmd = EXT4_IOC_GETRSVSZ; + break; + case EXT4_IOC32_SETRSVSZ: + cmd = EXT4_IOC_SETRSVSZ; + break; + case EXT4_IOC32_GROUP_ADD: { + struct compat_ext4_new_group_input __user *uinput; + struct ext4_new_group_data input; + int err; + + uinput = compat_ptr(arg); + err = get_user(input.group, &uinput->group); + err |= get_user(input.block_bitmap, &uinput->block_bitmap); + err |= get_user(input.inode_bitmap, &uinput->inode_bitmap); + err |= get_user(input.inode_table, &uinput->inode_table); + err |= get_user(input.blocks_count, &uinput->blocks_count); + err |= get_user(input.reserved_blocks, + &uinput->reserved_blocks); + if (err) + return -EFAULT; + return ext4_ioctl_group_add(file, &input); + } + case EXT4_IOC_MOVE_EXT: + case EXT4_IOC_RESIZE_FS: + case FITRIM: + case EXT4_IOC_PRECACHE_EXTENTS: + case FS_IOC_SET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_PWSALT: + case FS_IOC_GET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + case FS_IOC_ADD_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + case FS_IOC_GET_ENCRYPTION_NONCE: + case EXT4_IOC_SHUTDOWN: + case FS_IOC_GETFSMAP: + case FS_IOC_ENABLE_VERITY: + case FS_IOC_MEASURE_VERITY: + case FS_IOC_READ_VERITY_METADATA: + case EXT4_IOC_CLEAR_ES_CACHE: + case EXT4_IOC_GETSTATE: + case EXT4_IOC_GET_ES_CACHE: + case EXT4_IOC_CHECKPOINT: + case FS_IOC_GETFSLABEL: + case FS_IOC_SETFSLABEL: + case EXT4_IOC_GETFSUUID: + case EXT4_IOC_SETFSUUID: + break; + default: + return -ENOIOCTLCMD; + } + return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif + +static void set_overhead(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) +{ + es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg)); +} + +int ext4_update_overhead(struct super_block *sb, bool force) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (ext4_emergency_state(sb) || sb_rdonly(sb)) + return 0; + if (!force && + (sbi->s_overhead == 0 || + sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters))) + return 0; + return ext4_update_superblocks_fn(sb, set_overhead, &sbi->s_overhead); +} diff --git a/fs/ext4l/mballoc-test.c b/fs/ext4l/mballoc-test.c new file mode 100644 index 00000000000..a9416b20ff6 --- /dev/null +++ b/fs/ext4l/mballoc-test.c @@ -0,0 +1,999 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test of ext4 multiblocks allocation. 
+ */ + +#include <kunit/test.h> +#include <kunit/static_stub.h> +#include <linux/random.h> + +#include "ext4.h" + +struct mbt_grp_ctx { + struct buffer_head bitmap_bh; + /* desc and gd_bh are just the place holders for now */ + struct ext4_group_desc desc; + struct buffer_head gd_bh; +}; + +struct mbt_ctx { + struct mbt_grp_ctx *grp_ctx; +}; + +struct mbt_ext4_super_block { + struct ext4_super_block es; + struct ext4_sb_info sbi; + struct mbt_ctx mbt_ctx; +}; + +#define MBT_SB(_sb) (container_of((_sb)->s_fs_info, struct mbt_ext4_super_block, sbi)) +#define MBT_CTX(_sb) (&MBT_SB(_sb)->mbt_ctx) +#define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group]) + +static struct inode *mbt_alloc_inode(struct super_block *sb) +{ + struct ext4_inode_info *ei; + + ei = kmalloc(sizeof(struct ext4_inode_info), GFP_KERNEL); + if (!ei) + return NULL; + + INIT_LIST_HEAD(&ei->i_orphan); + init_rwsem(&ei->xattr_sem); + init_rwsem(&ei->i_data_sem); + inode_init_once(&ei->vfs_inode); + ext4_fc_init_inode(&ei->vfs_inode); + + return &ei->vfs_inode; +} + +static void mbt_free_inode(struct inode *inode) +{ + kfree(EXT4_I(inode)); +} + +static const struct super_operations mbt_sops = { + .alloc_inode = mbt_alloc_inode, + .free_inode = mbt_free_inode, +}; + +static void mbt_kill_sb(struct super_block *sb) +{ + generic_shutdown_super(sb); +} + +static struct file_system_type mbt_fs_type = { + .name = "mballoc test", + .kill_sb = mbt_kill_sb, +}; + +static int mbt_mb_init(struct super_block *sb) +{ + ext4_fsblk_t block; + int ret; + + /* needed by ext4_mb_init->bdev_nonrot(sb->s_bdev) */ + sb->s_bdev = kzalloc(sizeof(*sb->s_bdev), GFP_KERNEL); + if (sb->s_bdev == NULL) + return -ENOMEM; + + sb->s_bdev->bd_queue = kzalloc(sizeof(struct request_queue), GFP_KERNEL); + if (sb->s_bdev->bd_queue == NULL) { + kfree(sb->s_bdev); + return -ENOMEM; + } + + /* + * needed by ext4_mb_init->ext4_mb_init_backend-> sbi->s_buddy_cache = + * new_inode(sb); + */ + INIT_LIST_HEAD(&sb->s_inodes); + sb->s_op = &mbt_sops; + + ret = ext4_mb_init(sb); + if (ret != 0) + goto err_out; + + block = ext4_count_free_clusters(sb); + ret = percpu_counter_init(&EXT4_SB(sb)->s_freeclusters_counter, block, + GFP_KERNEL); + if (ret != 0) + goto err_mb_release; + + ret = percpu_counter_init(&EXT4_SB(sb)->s_dirtyclusters_counter, 0, + GFP_KERNEL); + if (ret != 0) + goto err_freeclusters; + + return 0; + +err_freeclusters: + percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter); +err_mb_release: + ext4_mb_release(sb); +err_out: + kfree(sb->s_bdev->bd_queue); + kfree(sb->s_bdev); + return ret; +} + +static void mbt_mb_release(struct super_block *sb) +{ + percpu_counter_destroy(&EXT4_SB(sb)->s_dirtyclusters_counter); + percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter); + ext4_mb_release(sb); + kfree(sb->s_bdev->bd_queue); + kfree(sb->s_bdev); +} + +static int mbt_set(struct super_block *sb, void *data) +{ + return 0; +} + +static struct super_block *mbt_ext4_alloc_super_block(void) +{ + struct mbt_ext4_super_block *fsb; + struct super_block *sb; + struct ext4_sb_info *sbi; + + fsb = kzalloc(sizeof(*fsb), GFP_KERNEL); + if (fsb == NULL) + return NULL; + + sb = sget(&mbt_fs_type, NULL, mbt_set, 0, NULL); + if (IS_ERR(sb)) + goto out; + + sbi = &fsb->sbi; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) + goto out_deactivate; + + bgl_lock_init(sbi->s_blockgroup_lock); + + sbi->s_es = &fsb->es; + sbi->s_sb = sb; + sb->s_fs_info = sbi; + + up_write(&sb->s_umount); + return 
sb; + +out_deactivate: + deactivate_locked_super(sb); +out: + kfree(fsb); + return NULL; +} + +static void mbt_ext4_free_super_block(struct super_block *sb) +{ + struct mbt_ext4_super_block *fsb = MBT_SB(sb); + struct ext4_sb_info *sbi = EXT4_SB(sb); + + kfree(sbi->s_blockgroup_lock); + deactivate_super(sb); + kfree(fsb); +} + +struct mbt_ext4_block_layout { + unsigned char blocksize_bits; + unsigned int cluster_bits; + uint32_t blocks_per_group; + ext4_group_t group_count; + uint16_t desc_size; +}; + +static void mbt_init_sb_layout(struct super_block *sb, + struct mbt_ext4_block_layout *layout) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + + sb->s_blocksize = 1UL << layout->blocksize_bits; + sb->s_blocksize_bits = layout->blocksize_bits; + + sbi->s_groups_count = layout->group_count; + sbi->s_blocks_per_group = layout->blocks_per_group; + sbi->s_cluster_bits = layout->cluster_bits; + sbi->s_cluster_ratio = 1U << layout->cluster_bits; + sbi->s_clusters_per_group = layout->blocks_per_group >> + layout->cluster_bits; + sbi->s_desc_size = layout->desc_size; + sbi->s_desc_per_block_bits = + sb->s_blocksize_bits - (fls(layout->desc_size) - 1); + sbi->s_desc_per_block = 1 << sbi->s_desc_per_block_bits; + + es->s_first_data_block = cpu_to_le32(0); + es->s_blocks_count_lo = cpu_to_le32(layout->blocks_per_group * + layout->group_count); +} + +static int mbt_grp_ctx_init(struct super_block *sb, + struct mbt_grp_ctx *grp_ctx) +{ + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + + grp_ctx->bitmap_bh.b_data = kzalloc(EXT4_BLOCK_SIZE(sb), GFP_KERNEL); + if (grp_ctx->bitmap_bh.b_data == NULL) + return -ENOMEM; + mb_set_bits(grp_ctx->bitmap_bh.b_data, max, sb->s_blocksize * 8 - max); + ext4_free_group_clusters_set(sb, &grp_ctx->desc, max); + + return 0; +} + +static void mbt_grp_ctx_release(struct mbt_grp_ctx *grp_ctx) +{ + kfree(grp_ctx->bitmap_bh.b_data); + grp_ctx->bitmap_bh.b_data = NULL; +} + +static void mbt_ctx_mark_used(struct super_block *sb, ext4_group_t group, + unsigned int start, unsigned int len) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group); + + mb_set_bits(grp_ctx->bitmap_bh.b_data, start, len); +} + +static void *mbt_ctx_bitmap(struct super_block *sb, ext4_group_t group) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group); + + return grp_ctx->bitmap_bh.b_data; +} + +/* called after mbt_init_sb_layout */ +static int mbt_ctx_init(struct super_block *sb) +{ + struct mbt_ctx *ctx = MBT_CTX(sb); + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + + ctx->grp_ctx = kcalloc(ngroups, sizeof(struct mbt_grp_ctx), + GFP_KERNEL); + if (ctx->grp_ctx == NULL) + return -ENOMEM; + + for (i = 0; i < ngroups; i++) + if (mbt_grp_ctx_init(sb, &ctx->grp_ctx[i])) + goto out; + + /* + * The first data block (first cluster in the first group) is used by + * metadata; mark it used to avoid allocating a data block at the + * first block, which would fail the ext4_sb_block_valid check. 
+ */ + mb_set_bits(ctx->grp_ctx[0].bitmap_bh.b_data, 0, 1); + ext4_free_group_clusters_set(sb, &ctx->grp_ctx[0].desc, + EXT4_CLUSTERS_PER_GROUP(sb) - 1); + + return 0; +out: + while (i-- > 0) + mbt_grp_ctx_release(&ctx->grp_ctx[i]); + kfree(ctx->grp_ctx); + return -ENOMEM; +} + +static void mbt_ctx_release(struct super_block *sb) +{ + struct mbt_ctx *ctx = MBT_CTX(sb); + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + + for (i = 0; i < ngroups; i++) + mbt_grp_ctx_release(&ctx->grp_ctx[i]); + kfree(ctx->grp_ctx); +} + +static struct buffer_head * +ext4_read_block_bitmap_nowait_stub(struct super_block *sb, ext4_group_t block_group, + bool ignore_locked) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group); + + /* paired with brelse from caller of ext4_read_block_bitmap_nowait */ + get_bh(&grp_ctx->bitmap_bh); + return &grp_ctx->bitmap_bh; +} + +static int ext4_wait_block_bitmap_stub(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh) +{ + /* + * real ext4_wait_block_bitmap will set these flags and + * functions like ext4_mb_init_cache will verify the flags. + */ + set_buffer_uptodate(bh); + set_bitmap_uptodate(bh); + set_buffer_verified(bh); + return 0; +} + +static struct ext4_group_desc * +ext4_get_group_desc_stub(struct super_block *sb, ext4_group_t block_group, + struct buffer_head **bh) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group); + + if (bh != NULL) + *bh = &grp_ctx->gd_bh; + + return &grp_ctx->desc; +} + +static int +ext4_mb_mark_context_stub(handle_t *handle, struct super_block *sb, bool state, + ext4_group_t group, ext4_grpblk_t blkoff, + ext4_grpblk_t len, int flags, + ext4_grpblk_t *ret_changed) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group); + struct buffer_head *bitmap_bh = &grp_ctx->bitmap_bh; + + if (state) + mb_set_bits(bitmap_bh->b_data, blkoff, len); + else + mb_clear_bits(bitmap_bh->b_data, blkoff, len); + + return 0; +} + +#define TEST_GOAL_GROUP 1 +static int mbt_kunit_init(struct kunit *test) +{ + struct mbt_ext4_block_layout *layout = + (struct mbt_ext4_block_layout *)(test->param_value); + struct super_block *sb; + int ret; + + sb = mbt_ext4_alloc_super_block(); + if (sb == NULL) + return -ENOMEM; + + mbt_init_sb_layout(sb, layout); + + ret = mbt_ctx_init(sb); + if (ret != 0) { + mbt_ext4_free_super_block(sb); + return ret; + } + + test->priv = sb; + kunit_activate_static_stub(test, + ext4_read_block_bitmap_nowait, + ext4_read_block_bitmap_nowait_stub); + kunit_activate_static_stub(test, + ext4_wait_block_bitmap, + ext4_wait_block_bitmap_stub); + kunit_activate_static_stub(test, + ext4_get_group_desc, + ext4_get_group_desc_stub); + kunit_activate_static_stub(test, + ext4_mb_mark_context, + ext4_mb_mark_context_stub); + + /* stub function will be called in mbt_mb_init->ext4_mb_init */ + if (mbt_mb_init(sb) != 0) { + mbt_ctx_release(sb); + mbt_ext4_free_super_block(sb); + return -ENOMEM; + } + + return 0; +} + +static void mbt_kunit_exit(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + + mbt_mb_release(sb); + mbt_ctx_release(sb); + mbt_ext4_free_super_block(sb); +} + +static void test_new_blocks_simple(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct inode *inode; + struct ext4_allocation_request ar; + ext4_group_t i, goal_group = TEST_GOAL_GROUP; + int err = 0; + ext4_fsblk_t found; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL); + if (!inode) + return; + + 
inode->i_sb = sb; + ar.inode = inode; + + /* get block at goal */ + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, ar.goal, found, + "failed to alloc block at goal, expected %llu found %llu", + ar.goal, found); + + /* get block after goal in goal group */ + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, ar.goal + EXT4_C2B(sbi, 1), found, + "failed to alloc block after goal in goal group, expected %llu found %llu", + ar.goal + 1, found); + + /* get block after goal group */ + mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb)); + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, + ext4_group_first_block_no(sb, goal_group + 1), found, + "failed to alloc block after goal group, expected %llu found %llu", + ext4_group_first_block_no(sb, goal_group + 1), found); + + /* get block before goal group */ + for (i = goal_group; i < ext4_get_groups_count(sb); i++) + mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb)); + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, + ext4_group_first_block_no(sb, 0) + EXT4_C2B(sbi, 1), found, + "failed to alloc block before goal group, expected %llu found %llu", + ext4_group_first_block_no(sb, 0 + EXT4_C2B(sbi, 1)), found); + + /* no block available, fail to allocate block */ + for (i = 0; i < ext4_get_groups_count(sb); i++) + mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb)); + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_NE_MSG(test, err, 0, + "unexpectedly get block when no block is available"); +} + +#define TEST_RANGE_COUNT 8 + +struct test_range { + ext4_grpblk_t start; + ext4_grpblk_t len; +}; + +static void +mbt_generate_test_ranges(struct super_block *sb, struct test_range *ranges, + int count) +{ + ext4_grpblk_t start, len, max; + int i; + + max = EXT4_CLUSTERS_PER_GROUP(sb) / count; + for (i = 0; i < count; i++) { + start = get_random_u32() % max; + len = get_random_u32() % max; + len = min(len, max - start); + + ranges[i].start = start + i * max; + ranges[i].len = len; + } +} + +static void +validate_free_blocks_simple(struct kunit *test, struct super_block *sb, + ext4_group_t goal_group, ext4_grpblk_t start, + ext4_grpblk_t len) +{ + void *bitmap; + ext4_grpblk_t bit, max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_group_t i; + + for (i = 0; i < ext4_get_groups_count(sb); i++) { + if (i == goal_group) + continue; + + bitmap = mbt_ctx_bitmap(sb, i); + bit = mb_find_next_zero_bit(bitmap, max, 0); + KUNIT_ASSERT_EQ_MSG(test, bit, max, + "free block on unexpected group %d", i); + } + + bitmap = mbt_ctx_bitmap(sb, goal_group); + bit = mb_find_next_zero_bit(bitmap, max, 0); + KUNIT_ASSERT_EQ(test, bit, start); + + bit = mb_find_next_bit(bitmap, max, bit + 1); + KUNIT_ASSERT_EQ(test, bit, start + len); +} + +static void +test_free_blocks_simple_range(struct kunit *test, ext4_group_t goal_group, + ext4_grpblk_t start, ext4_grpblk_t len) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode; + ext4_fsblk_t block; + + inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL); + if (!inode) + return; + inode->i_sb = sb; + + if (len == 0) + return; + + block = 
ext4_group_first_block_no(sb, goal_group) + + EXT4_C2B(sbi, start); + ext4_free_blocks_simple(inode, block, len); + validate_free_blocks_simple(test, sb, goal_group, start, len); + mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb)); +} + +static void test_free_blocks_simple(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_group_t i; + struct test_range ranges[TEST_RANGE_COUNT]; + + for (i = 0; i < ext4_get_groups_count(sb); i++) + mbt_ctx_mark_used(sb, i, 0, max); + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_free_blocks_simple_range(test, TEST_GOAL_GROUP, + ranges[i].start, ranges[i].len); +} + +static void +test_mark_diskspace_used_range(struct kunit *test, + struct ext4_allocation_context *ac, + ext4_grpblk_t start, + ext4_grpblk_t len) +{ + struct super_block *sb = (struct super_block *)test->priv; + int ret; + void *bitmap; + ext4_grpblk_t i, max; + + /* ext4_mb_mark_diskspace_used will BUG if len is 0 */ + if (len == 0) + return; + + ac->ac_b_ex.fe_group = TEST_GOAL_GROUP; + ac->ac_b_ex.fe_start = start; + ac->ac_b_ex.fe_len = len; + + bitmap = mbt_ctx_bitmap(sb, TEST_GOAL_GROUP); + memset(bitmap, 0, sb->s_blocksize); + ret = ext4_mb_mark_diskspace_used(ac, NULL, 0); + KUNIT_ASSERT_EQ(test, ret, 0); + + max = EXT4_CLUSTERS_PER_GROUP(sb); + i = mb_find_next_bit(bitmap, max, 0); + KUNIT_ASSERT_EQ(test, i, start); + i = mb_find_next_zero_bit(bitmap, max, i + 1); + KUNIT_ASSERT_EQ(test, i, start + len); + i = mb_find_next_bit(bitmap, max, i + 1); + KUNIT_ASSERT_EQ(test, max, i); +} + +static void test_mark_diskspace_used(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct inode *inode; + struct ext4_allocation_context ac; + struct test_range ranges[TEST_RANGE_COUNT]; + int i; + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + + inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL); + if (!inode) + return; + inode->i_sb = sb; + + ac.ac_status = AC_STATUS_FOUND; + ac.ac_sb = sb; + ac.ac_inode = inode; + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_mark_diskspace_used_range(test, &ac, ranges[i].start, + ranges[i].len); +} + +static void mbt_generate_buddy(struct super_block *sb, void *buddy, + void *bitmap, struct ext4_group_info *grp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + uint32_t order, off; + void *bb, *bb_h; + int max; + + memset(buddy, 0xff, sb->s_blocksize); + memset(grp, 0, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)])); + + bb = bitmap; + max = EXT4_CLUSTERS_PER_GROUP(sb); + bb_h = buddy + sbi->s_mb_offsets[1]; + + off = mb_find_next_zero_bit(bb, max, 0); + grp->bb_first_free = off; + while (off < max) { + grp->bb_counters[0]++; + grp->bb_free++; + + if (!(off & 1) && !mb_test_bit(off + 1, bb)) { + grp->bb_free++; + grp->bb_counters[0]--; + mb_clear_bit(off >> 1, bb_h); + grp->bb_counters[1]++; + grp->bb_largest_free_order = 1; + off++; + } + + off = mb_find_next_zero_bit(bb, max, off + 1); + } + + for (order = 1; order < MB_NUM_ORDERS(sb) - 1; order++) { + bb = buddy + sbi->s_mb_offsets[order]; + bb_h = buddy + sbi->s_mb_offsets[order + 1]; + max = max >> 1; + off = mb_find_next_zero_bit(bb, max, 0); + + while (off < max) { + if (!(off & 1) && !mb_test_bit(off + 1, bb)) { + mb_set_bits(bb, off, 2); + grp->bb_counters[order] -= 2; + mb_clear_bit(off >> 1, bb_h); + grp->bb_counters[order + 1]++; + grp->bb_largest_free_order = order + 
1; + off++; + } + + off = mb_find_next_zero_bit(bb, max, off + 1); + } + } + + max = EXT4_CLUSTERS_PER_GROUP(sb); + off = mb_find_next_zero_bit(bitmap, max, 0); + while (off < max) { + grp->bb_fragments++; + + off = mb_find_next_bit(bitmap, max, off + 1); + if (off + 1 >= max) + break; + + off = mb_find_next_zero_bit(bitmap, max, off + 1); + } +} + +static void +mbt_validate_group_info(struct kunit *test, struct ext4_group_info *grp1, + struct ext4_group_info *grp2) +{ + struct super_block *sb = (struct super_block *)test->priv; + int i; + + KUNIT_ASSERT_EQ(test, grp1->bb_first_free, + grp2->bb_first_free); + KUNIT_ASSERT_EQ(test, grp1->bb_fragments, + grp2->bb_fragments); + KUNIT_ASSERT_EQ(test, grp1->bb_free, grp2->bb_free); + KUNIT_ASSERT_EQ(test, grp1->bb_largest_free_order, + grp2->bb_largest_free_order); + + for (i = 1; i < MB_NUM_ORDERS(sb); i++) { + KUNIT_ASSERT_EQ_MSG(test, grp1->bb_counters[i], + grp2->bb_counters[i], + "bb_counters[%d] diffs, expected %d, generated %d", + i, grp1->bb_counters[i], + grp2->bb_counters[i]); + } +} + +static void +do_test_generate_buddy(struct kunit *test, struct super_block *sb, void *bitmap, + void *mbt_buddy, struct ext4_group_info *mbt_grp, + void *ext4_buddy, struct ext4_group_info *ext4_grp) +{ + int i; + + mbt_generate_buddy(sb, mbt_buddy, bitmap, mbt_grp); + + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + ext4_grp->bb_counters[i] = 0; + /* needed by validation in ext4_mb_generate_buddy */ + ext4_grp->bb_free = mbt_grp->bb_free; + memset(ext4_buddy, 0xff, sb->s_blocksize); + ext4_mb_generate_buddy(sb, ext4_buddy, bitmap, TEST_GOAL_GROUP, + ext4_grp); + + KUNIT_ASSERT_EQ(test, memcmp(mbt_buddy, ext4_buddy, sb->s_blocksize), + 0); + mbt_validate_group_info(test, mbt_grp, ext4_grp); +} + +static void test_mb_generate_buddy(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + void *bitmap, *expected_bb, *generate_bb; + struct ext4_group_info *expected_grp, *generate_grp; + struct test_range ranges[TEST_RANGE_COUNT]; + int i; + + bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap); + expected_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_bb); + generate_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, generate_bb); + expected_grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_grp); + generate_grp = ext4_get_group_info(sb, TEST_GOAL_GROUP); + KUNIT_ASSERT_NOT_NULL(test, generate_grp); + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) { + mb_set_bits(bitmap, ranges[i].start, ranges[i].len); + do_test_generate_buddy(test, sb, bitmap, expected_bb, + expected_grp, generate_bb, generate_grp); + } +} + +static void +test_mb_mark_used_range(struct kunit *test, struct ext4_buddy *e4b, + ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap, + void *buddy, struct ext4_group_info *grp) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct ext4_free_extent ex; + int i; + + /* mb_mark_used only accepts non-zero len */ + if (len == 0) + return; + + ex.fe_start = start; + ex.fe_len = len; + ex.fe_group = TEST_GOAL_GROUP; + + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_mark_used(e4b, &ex); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + + mb_set_bits(bitmap, start, len); + /* bypass bb_free validation in 
ext4_mb_generate_buddy */ + grp->bb_free -= len; + memset(buddy, 0xff, sb->s_blocksize); + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + grp->bb_counters[i] = 0; + ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp); + + KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize), + 0); + mbt_validate_group_info(test, grp, e4b->bd_info); +} + +static void test_mb_mark_used(struct kunit *test) +{ + struct ext4_buddy e4b; + struct super_block *sb = (struct super_block *)test->priv; + void *bitmap, *buddy; + struct ext4_group_info *grp; + int ret; + struct test_range ranges[TEST_RANGE_COUNT]; + int i; + + /* buddy cache assumes that each page contains at least one block */ + if (sb->s_blocksize > PAGE_SIZE) + kunit_skip(test, "blocksize exceeds pagesize"); + + bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap); + buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); + grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp); + + ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + KUNIT_ASSERT_EQ(test, ret, 0); + + grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb); + grp->bb_largest_free_order = -1; + grp->bb_avg_fragment_size_order = -1; + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_mb_mark_used_range(test, &e4b, ranges[i].start, + ranges[i].len, bitmap, buddy, grp); + + ext4_mb_unload_buddy(&e4b); +} + +static void +test_mb_free_blocks_range(struct kunit *test, struct ext4_buddy *e4b, + ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap, + void *buddy, struct ext4_group_info *grp) +{ + struct super_block *sb = (struct super_block *)test->priv; + int i; + + /* mb_free_blocks will WARN if len is 0 */ + if (len == 0) + return; + + ext4_lock_group(sb, e4b->bd_group); + mb_free_blocks(NULL, e4b, start, len); + ext4_unlock_group(sb, e4b->bd_group); + + mb_clear_bits(bitmap, start, len); + /* bypass bb_free validation in ext4_mb_generate_buddy */ + grp->bb_free += len; + memset(buddy, 0xff, sb->s_blocksize); + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + grp->bb_counters[i] = 0; + ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp); + + KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize), + 0); + mbt_validate_group_info(test, grp, e4b->bd_info); + +} + +static void test_mb_free_blocks(struct kunit *test) +{ + struct ext4_buddy e4b; + struct super_block *sb = (struct super_block *)test->priv; + void *bitmap, *buddy; + struct ext4_group_info *grp; + struct ext4_free_extent ex; + int ret; + int i; + struct test_range ranges[TEST_RANGE_COUNT]; + + /* buddy cache assumes that each page contains at least one block */ + if (sb->s_blocksize > PAGE_SIZE) + kunit_skip(test, "blocksize exceeds pagesize"); + + bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap); + buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); + grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp); + + ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + KUNIT_ASSERT_EQ(test, ret, 0); + + ex.fe_start = 0; + ex.fe_len = EXT4_CLUSTERS_PER_GROUP(sb); + ex.fe_group = TEST_GOAL_GROUP; + + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_mark_used(&e4b, &ex); + ext4_unlock_group(sb, 
TEST_GOAL_GROUP); + + grp->bb_free = 0; + grp->bb_largest_free_order = -1; + grp->bb_avg_fragment_size_order = -1; + memset(bitmap, 0xff, sb->s_blocksize); + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_mb_free_blocks_range(test, &e4b, ranges[i].start, + ranges[i].len, bitmap, buddy, grp); + + ext4_mb_unload_buddy(&e4b); +} + +#define COUNT_FOR_ESTIMATE 100000 +static void test_mb_mark_used_cost(struct kunit *test) +{ + struct ext4_buddy e4b; + struct super_block *sb = (struct super_block *)test->priv; + struct ext4_free_extent ex; + int ret; + struct test_range ranges[TEST_RANGE_COUNT]; + int i, j; + unsigned long start, end, all = 0; + + /* buddy cache assumes that each page contains at least one block */ + if (sb->s_blocksize > PAGE_SIZE) + kunit_skip(test, "blocksize exceeds pagesize"); + + ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + KUNIT_ASSERT_EQ(test, ret, 0); + + ex.fe_group = TEST_GOAL_GROUP; + for (j = 0; j < COUNT_FOR_ESTIMATE; j++) { + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + start = jiffies; + for (i = 0; i < TEST_RANGE_COUNT; i++) { + if (ranges[i].len == 0) + continue; + + ex.fe_start = ranges[i].start; + ex.fe_len = ranges[i].len; + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_mark_used(&e4b, &ex); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + } + end = jiffies; + all += (end - start); + + for (i = 0; i < TEST_RANGE_COUNT; i++) { + if (ranges[i].len == 0) + continue; + + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_free_blocks(NULL, &e4b, ranges[i].start, + ranges[i].len); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + } + } + + kunit_info(test, "costed jiffies %lu\n", all); + ext4_mb_unload_buddy(&e4b); +} + +static const struct mbt_ext4_block_layout mbt_test_layouts[] = { + { + .blocksize_bits = 10, + .cluster_bits = 3, + .blocks_per_group = 8192, + .group_count = 4, + .desc_size = 64, + }, + { + .blocksize_bits = 12, + .cluster_bits = 3, + .blocks_per_group = 8192, + .group_count = 4, + .desc_size = 64, + }, + { + .blocksize_bits = 16, + .cluster_bits = 3, + .blocks_per_group = 8192, + .group_count = 4, + .desc_size = 64, + }, +}; + +static void mbt_show_layout(const struct mbt_ext4_block_layout *layout, + char *desc) +{ + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "block_bits=%d cluster_bits=%d " + "blocks_per_group=%d group_count=%d desc_size=%d\n", + layout->blocksize_bits, layout->cluster_bits, + layout->blocks_per_group, layout->group_count, + layout->desc_size); +} +KUNIT_ARRAY_PARAM(mbt_layouts, mbt_test_layouts, mbt_show_layout); + +static struct kunit_case mbt_test_cases[] = { + KUNIT_CASE_PARAM(test_new_blocks_simple, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_free_blocks_simple, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mb_generate_buddy, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mb_mark_used, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mb_free_blocks, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mark_diskspace_used, mbt_layouts_gen_params), + KUNIT_CASE_PARAM_ATTR(test_mb_mark_used_cost, mbt_layouts_gen_params, + { .speed = KUNIT_SPEED_SLOW }), + {} +}; + +static struct kunit_suite mbt_test_suite = { + .name = "ext4_mballoc_test", + .init = mbt_kunit_init, + .exit = mbt_kunit_exit, + .test_cases = mbt_test_cases, +}; + +kunit_test_suites(&mbt_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/fs/ext4l/mballoc.h b/fs/ext4l/mballoc.h new file mode 100644 index 00000000000..15a049f05d0 --- /dev/null +++ b/fs/ext4l/mballoc.h @@ -0,0 +1,273 @@ +// 
SPDX-License-Identifier: GPL-2.0 +/* + * fs/ext4/mballoc.h + * + * Written by: Alex Tomas <alex@clusterfs.com> + * + */ +#ifndef _EXT4_MBALLOC_H +#define _EXT4_MBALLOC_H + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> +#include <linux/module.h> +#include <linux/swap.h> +#include <linux/proc_fs.h> +#include <linux/pagemap.h> +#include <linux/seq_file.h> +#include <linux/blkdev.h> +#include <linux/mutex.h> +#include "ext4_jbd2.h" +#include "ext4.h" + +/* + * mb_debug() dynamic printk msgs could be used to debug mballoc code. + */ +#ifdef CONFIG_EXT4_DEBUG +#define mb_debug(sb, fmt, ...) \ + pr_debug("[%s/%d] EXT4-fs (%s): (%s, %d): %s: " fmt, \ + current->comm, task_pid_nr(current), sb->s_id, \ + __FILE__, __LINE__, __func__, ##__VA_ARGS__) +#else +#define mb_debug(sb, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ +#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ + +/* + * How long mballoc can look for a best extent (in found extents) + */ +#define MB_DEFAULT_MAX_TO_SCAN 200 + +/* + * How long mballoc must look for a best extent + */ +#define MB_DEFAULT_MIN_TO_SCAN 10 + +/* + * with 's_mb_stats' the allocator will collect stats that will be + * shown at umount. The collecting costs though! + */ +#define MB_DEFAULT_STATS 0 + +/* + * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served + * by the stream allocator, whose purpose is to pack requests + * as close to each other as possible to produce smooth I/O traffic. + * We use locality group prealloc space for stream requests. + * We can tune the same via /proc/fs/ext4/<partition>/stream_req + */ +#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ + +/* + * for which requests use 2^N search using buddies + */ +#define MB_DEFAULT_ORDER2_REQS 2 + +/* + * default group prealloc size 512 blocks + */ +#define MB_DEFAULT_GROUP_PREALLOC 512 + +/* + * Number of groups to search linearly before performing group scanning + * optimization. + */ +#define MB_DEFAULT_LINEAR_LIMIT 4 + +/* + * Minimum number of groups that should be present in the file system to perform + * group scanning optimizations. + */ +#define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16 + +/* + * The maximum order up to which CR_BEST_AVAIL_LEN can trim a particular + * allocation request. For example, if we have an order 7 request and max trim + * order of 3, we can trim this request up to order 4. + */ +#define MB_DEFAULT_BEST_AVAIL_TRIM_ORDER 3 + +/* + * Number of valid buddy orders + */ +#define MB_NUM_ORDERS(sb) ((sb)->s_blocksize_bits + 2) + +struct ext4_free_data { + /* this links the free block information from sb_info */ + struct list_head efd_list; + + /* this links the free block information from group_info */ + struct rb_node efd_node; + + /* group to which the free block extent belongs */ + ext4_group_t efd_group; + + /* free block extent */ + ext4_grpblk_t efd_start_cluster; + ext4_grpblk_t efd_count; + + /* transaction which freed this extent */ + tid_t efd_tid; +}; + +struct ext4_prealloc_space { + union { + struct rb_node inode_node; /* for inode PA rbtree */ + struct list_head lg_list; /* for lg PAs */ + } pa_node; + struct list_head pa_group_list; + union { + struct list_head pa_tmp_list; + struct rcu_head pa_rcu; + } u; + spinlock_t pa_lock; + atomic_t pa_count; + unsigned pa_deleted; + ext4_fsblk_t pa_pstart; /* phys. block */ + ext4_lblk_t pa_lstart; /* log. 
block */ + ext4_grpblk_t pa_len; /* len of preallocated chunk */ + ext4_grpblk_t pa_free; /* how many blocks are free */ + unsigned short pa_type; /* pa type: inode or group */ + union { + rwlock_t *inode_lock; /* locks the rbtree holding this PA */ + spinlock_t *lg_lock; /* locks the lg list holding this PA */ + } pa_node_lock; + struct inode *pa_inode; /* used to get the inode during group discard */ +}; + +enum { + MB_INODE_PA = 0, + MB_GROUP_PA = 1 +}; + +struct ext4_free_extent { + ext4_lblk_t fe_logical; + ext4_grpblk_t fe_start; /* In cluster units */ + ext4_group_t fe_group; + ext4_grpblk_t fe_len; /* In cluster units */ +}; + +/* + * Locality group: + * we try to group all related changes together + * so that writeback can flush/allocate them together as well. + * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC + * (512). We store prealloc space into the hash based on the pa_free blocks + * order value, i.e. fls(pa_free)-1; + */ +#define PREALLOC_TB_SIZE 10 +struct ext4_locality_group { + /* for allocator */ + /* to serialize allocates */ + struct mutex lg_mutex; + /* list of preallocations */ + struct list_head lg_prealloc_list[PREALLOC_TB_SIZE]; + spinlock_t lg_prealloc_lock; +}; + +struct ext4_allocation_context { + struct inode *ac_inode; + struct super_block *ac_sb; + + /* original request */ + struct ext4_free_extent ac_o_ex; + + /* goal request (normalized ac_o_ex) */ + struct ext4_free_extent ac_g_ex; + + /* the best found extent */ + struct ext4_free_extent ac_b_ex; + + /* copy of the best found extent taken before preallocation efforts */ + struct ext4_free_extent ac_f_ex; + + /* + * goal len can change in CR_BEST_AVAIL_LEN, so save the original len. + * This is used while adjusting the PA window and for accounting. + */ + ext4_grpblk_t ac_orig_goal_len; + + ext4_group_t ac_prefetch_grp; + unsigned int ac_prefetch_ios; + unsigned int ac_prefetch_nr; + + int ac_first_err; + + __u32 ac_flags; /* allocation hints */ + __u16 ac_groups_scanned; + __u16 ac_found; + __u16 ac_cX_found[EXT4_MB_NUM_CRS]; + __u16 ac_tail; + __u16 ac_buddy; + __u8 ac_status; + __u8 ac_criteria; + __u8 ac_2order; /* if request is to allocate 2^N blocks and + * N > 0, the field stores N, otherwise 0 */ + __u8 ac_op; /* operation, for history only */ + + struct ext4_buddy *ac_e4b; + struct folio *ac_bitmap_folio; + struct folio *ac_buddy_folio; + struct ext4_prealloc_space *ac_pa; + struct ext4_locality_group *ac_lg; +}; + +#define AC_STATUS_CONTINUE 1 +#define AC_STATUS_FOUND 2 +#define AC_STATUS_BREAK 3 + +struct ext4_buddy { + struct folio *bd_buddy_folio; + void *bd_buddy; + struct folio *bd_bitmap_folio; + void *bd_bitmap; + struct ext4_group_info *bd_info; + struct super_block *bd_sb; + __u16 bd_blkbits; + ext4_group_t bd_group; +}; + +static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, + struct ext4_free_extent *fex) +{ + return ext4_group_first_block_no(sb, fex->fe_group) + + (fex->fe_start << EXT4_SB(sb)->s_cluster_bits); +} + +static inline loff_t extent_logical_end(struct ext4_sb_info *sbi, + struct ext4_free_extent *fex) +{ + /* Use loff_t to avoid end exceeding ext4_lblk_t max. */ + return (loff_t)fex->fe_logical + EXT4_C2B(sbi, fex->fe_len); +} + +static inline loff_t pa_logical_end(struct ext4_sb_info *sbi, + struct ext4_prealloc_space *pa) +{ + /* Use loff_t to avoid end exceeding ext4_lblk_t max. 
*/ + return (loff_t)pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len); +} + +typedef int (*ext4_mballoc_query_range_fn)( + struct super_block *sb, + ext4_group_t agno, + ext4_grpblk_t start, + ext4_grpblk_t len, + void *priv); + +int +ext4_mballoc_query_range( + struct super_block *sb, + ext4_group_t agno, + ext4_grpblk_t start, + ext4_grpblk_t end, + ext4_mballoc_query_range_fn meta_formatter, + ext4_mballoc_query_range_fn formatter, + void *priv); + +#endif -- 2.43.0
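A note for reviewers on the interface declared above: ext4_mballoc_query_range() walks the free-space information of a single block group and invokes the callbacks over the requested cluster range. Judging by the parameter names, meta_formatter reports in-use ranges while formatter reports free extents; in Linux the fsmap code brought in earlier in this series is the main caller. What follows is a minimal illustrative sketch, not part of the imported source: struct mbq_counts and the helpers mbq_count_meta(), mbq_count_free() and mbt_report_group() are invented here, and the sketch assumes 'end' names the last cluster of interest in the group.

/* Invented for this sketch -- not part of the imported code. */
struct mbq_counts {
	unsigned int meta_nr;		/* in-use ranges reported */
	unsigned int free_nr;		/* free extents reported */
	unsigned int free_clusters;	/* total free clusters seen */
};

static int mbq_count_meta(struct super_block *sb, ext4_group_t group,
			  ext4_grpblk_t start, ext4_grpblk_t len, void *priv)
{
	struct mbq_counts *c = priv;

	c->meta_nr++;
	return 0;	/* a non-zero return is treated as an error and stops the walk */
}

static int mbq_count_free(struct super_block *sb, ext4_group_t group,
			  ext4_grpblk_t start, ext4_grpblk_t len, void *priv)
{
	struct mbq_counts *c = priv;

	c->free_nr++;
	c->free_clusters += len;
	return 0;
}

/* Summarise one group's free space; illustrative only. */
static int mbt_report_group(struct super_block *sb, ext4_group_t group)
{
	struct mbq_counts c = {};
	int err;

	err = ext4_mballoc_query_range(sb, group, 0,
				       EXT4_CLUSTERS_PER_GROUP(sb) - 1,
				       mbq_count_meta, mbq_count_free, &c);
	if (err)
		return err;

	ext4_msg(sb, KERN_INFO,
		 "group %u: %u free extents (%u clusters), %u used ranges",
		 group, c.free_nr, c.free_clusters, c.meta_nr);
	return 0;
}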
From: Simon Glass <simon.glass@canonical.com> Copy migrate.c, mmp.c, move_extent.c, orphan.c, and page-io.c from the Linux v6.18 fs/ext4 directory. - migrate: migration between indirect and extent mapping - mmp: multi-mount protection - move_extent: extent moving ioctl support - orphan: orphan inode handling - page-io: page I/O operations Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> --- fs/ext4l/migrate.c | 672 +++++++++++++++++++++++++++++++++++++++ fs/ext4l/mmp.c | 408 ++++++++++++++++++++++++ fs/ext4l/move_extent.c | 706 +++++++++++++++++++++++++++++++++++++++++ fs/ext4l/orphan.c | 659 ++++++++++++++++++++++++++++++++++++++ fs/ext4l/page-io.c | 593 ++++++++++++++++++++++++++++++++++ 5 files changed, 3038 insertions(+) create mode 100644 fs/ext4l/migrate.c create mode 100644 fs/ext4l/mmp.c create mode 100644 fs/ext4l/move_extent.c create mode 100644 fs/ext4l/orphan.c create mode 100644 fs/ext4l/page-io.c diff --git a/fs/ext4l/migrate.c b/fs/ext4l/migrate.c new file mode 100644 index 00000000000..1b0dfd963d3 --- /dev/null +++ b/fs/ext4l/migrate.c @@ -0,0 +1,672 @@ +// SPDX-License-Identifier: LGPL-2.1 +/* + * Copyright IBM Corporation, 2007 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + */ + +#include <linux/slab.h> +#include "ext4_jbd2.h" +#include "ext4_extents.h" + +/* + * Details of the contiguous blocks which can be + * represented by a single extent + */ +struct migrate_struct { + ext4_lblk_t first_block, last_block, curr_block; + ext4_fsblk_t first_pblock, last_pblock; +}; + +static int finish_range(handle_t *handle, struct inode *inode, + struct migrate_struct *lb) + +{ + int retval = 0, needed; + struct ext4_extent newext; + struct ext4_ext_path *path; + if (lb->first_pblock == 0) + return 0; + + /* Add the extent to the temp inode */ + newext.ee_block = cpu_to_le32(lb->first_block); + newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); + ext4_ext_store_pblock(&newext, lb->first_pblock); + /* Locking only for convenience since we are operating on temp inode */ + down_write(&EXT4_I(inode)->i_data_sem); + path = ext4_find_extent(inode, lb->first_block, NULL, 0); + if (IS_ERR(path)) { + retval = PTR_ERR(path); + goto err_out; + } + + /* + * Calculate the credits needed to insert this extent. + * Since we are doing this in a loop we may accumulate extra + * credits. But below we try not to accumulate too many + * of them by restarting the journal. + */ + needed = ext4_ext_calc_credits_for_single_extent(inode, + lb->last_block - lb->first_block + 1, path); + + retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0); + if (retval < 0) + goto err_out; + path = ext4_ext_insert_extent(handle, inode, path, &newext, 0); + if (IS_ERR(path)) + retval = PTR_ERR(path); +err_out: + up_write((&EXT4_I(inode)->i_data_sem)); + ext4_free_ext_path(path); + lb->first_pblock = 0; + return retval; +} + +static int update_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, struct migrate_struct *lb) +{ + int retval; + /* + * See if we can add on to the existing range (if it exists) + */ + if (lb->first_pblock && + (lb->last_pblock+1 == pblock) && + (lb->last_block+1 == lb->curr_block)) { + lb->last_pblock = pblock; + lb->last_block = lb->curr_block; + lb->curr_block++; + return 0; + } + /* + * Start a new range. 
+ */ + retval = finish_range(handle, inode, lb); + lb->first_pblock = lb->last_pblock = pblock; + lb->first_block = lb->last_block = lb->curr_block; + lb->curr_block++; + return retval; +} + +static int update_ind_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, + struct migrate_struct *lb) +{ + struct buffer_head *bh; + __le32 *i_data; + int i, retval = 0; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + bh = ext4_sb_bread(inode->i_sb, pblock, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + + i_data = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (i_data[i]) { + retval = update_extent_range(handle, inode, + le32_to_cpu(i_data[i]), lb); + if (retval) + break; + } else { + lb->curr_block++; + } + } + put_bh(bh); + return retval; + +} + +static int update_dind_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, + struct migrate_struct *lb) +{ + struct buffer_head *bh; + __le32 *i_data; + int i, retval = 0; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + bh = ext4_sb_bread(inode->i_sb, pblock, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + + i_data = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (i_data[i]) { + retval = update_ind_extent_range(handle, inode, + le32_to_cpu(i_data[i]), lb); + if (retval) + break; + } else { + /* Only update the file block number */ + lb->curr_block += max_entries; + } + } + put_bh(bh); + return retval; + +} + +static int update_tind_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, + struct migrate_struct *lb) +{ + struct buffer_head *bh; + __le32 *i_data; + int i, retval = 0; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + bh = ext4_sb_bread(inode->i_sb, pblock, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + + i_data = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (i_data[i]) { + retval = update_dind_extent_range(handle, inode, + le32_to_cpu(i_data[i]), lb); + if (retval) + break; + } else { + /* Only update the file block number */ + lb->curr_block += max_entries * max_entries; + } + } + put_bh(bh); + return retval; + +} + +static int free_dind_blocks(handle_t *handle, + struct inode *inode, __le32 i_data) +{ + int i; + __le32 *tmp_idata; + struct buffer_head *bh; + struct super_block *sb = inode->i_sb; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + int err; + + bh = ext4_sb_bread(sb, le32_to_cpu(i_data), 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + + tmp_idata = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (tmp_idata[i]) { + err = ext4_journal_ensure_credits(handle, + EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(sb, 1)); + if (err < 0) { + put_bh(bh); + return err; + } + ext4_free_blocks(handle, inode, NULL, + le32_to_cpu(tmp_idata[i]), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + } + } + put_bh(bh); + err = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(sb, 1)); + if (err < 0) + return err; + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + return 0; +} + +static int free_tind_blocks(handle_t *handle, + struct inode *inode, __le32 i_data) +{ + int i, retval = 0; + __le32 *tmp_idata; + struct buffer_head *bh; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + + tmp_idata = 
(__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (tmp_idata[i]) { + retval = free_dind_blocks(handle, + inode, tmp_idata[i]); + if (retval) { + put_bh(bh); + return retval; + } + } + } + put_bh(bh); + retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(inode->i_sb, 1)); + if (retval < 0) + return retval; + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + return 0; +} + +static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) +{ + int retval; + + /* ei->i_data[EXT4_IND_BLOCK] */ + if (i_data[0]) { + retval = ext4_journal_ensure_credits(handle, + EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(inode->i_sb, 1)); + if (retval < 0) + return retval; + ext4_free_blocks(handle, inode, NULL, + le32_to_cpu(i_data[0]), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + } + + /* ei->i_data[EXT4_DIND_BLOCK] */ + if (i_data[1]) { + retval = free_dind_blocks(handle, inode, i_data[1]); + if (retval) + return retval; + } + + /* ei->i_data[EXT4_TIND_BLOCK] */ + if (i_data[2]) { + retval = free_tind_blocks(handle, inode, i_data[2]); + if (retval) + return retval; + } + return 0; +} + +static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, + struct inode *tmp_inode) +{ + int retval, retval2 = 0; + __le32 i_data[3]; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode); + + /* + * One credit accounted for writing the + * i_data field of the original inode + */ + retval = ext4_journal_ensure_credits(handle, 1, 0); + if (retval < 0) + goto err_out; + + i_data[0] = ei->i_data[EXT4_IND_BLOCK]; + i_data[1] = ei->i_data[EXT4_DIND_BLOCK]; + i_data[2] = ei->i_data[EXT4_TIND_BLOCK]; + + down_write(&EXT4_I(inode)->i_data_sem); + /* + * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation + * happened after we started the migrate. We need to + * fail the migrate + */ + if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) { + retval = -EAGAIN; + up_write(&EXT4_I(inode)->i_data_sem); + goto err_out; + } else + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + /* + * We have the extent map build with the tmp inode. + * Now copy the i_data across + */ + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); + + /* + * Update i_blocks with the new blocks that got + * allocated while adding extents for extent index + * blocks. + * + * While converting to extents we need not + * update the original inode i_blocks for extent blocks + * via quota APIs. The quota update happened via tmp_inode already. 
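+	 * (Only the index blocks allocated for the new extent tree are
+	 * added here; the indirect blocks still included in i_blocks
+	 * are subtracted later, when free_ind_block() releases them.)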
+ */ + spin_lock(&inode->i_lock); + inode->i_blocks += tmp_inode->i_blocks; + spin_unlock(&inode->i_lock); + up_write(&EXT4_I(inode)->i_data_sem); + + /* + * We mark the inode dirty after, because we decrement the + * i_blocks when freeing the indirect meta-data blocks + */ + retval = free_ind_block(handle, inode, i_data); + retval2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(retval2 && !retval)) + retval = retval2; + +err_out: + return retval; +} + +static int free_ext_idx(handle_t *handle, struct inode *inode, + struct ext4_extent_idx *ix) +{ + int i, retval = 0; + ext4_fsblk_t block; + struct buffer_head *bh; + struct ext4_extent_header *eh; + + block = ext4_idx_pblock(ix); + bh = ext4_sb_bread(inode->i_sb, block, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + + eh = (struct ext4_extent_header *)bh->b_data; + if (eh->eh_depth != 0) { + ix = EXT_FIRST_INDEX(eh); + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { + retval = free_ext_idx(handle, inode, ix); + if (retval) { + put_bh(bh); + return retval; + } + } + } + put_bh(bh); + retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(inode->i_sb, 1)); + if (retval < 0) + return retval; + ext4_free_blocks(handle, inode, NULL, block, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + return 0; +} + +/* + * Free the extent meta data blocks only + */ +static int free_ext_block(handle_t *handle, struct inode *inode) +{ + int i, retval = 0; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_extent_header *eh = (struct ext4_extent_header *)ei->i_data; + struct ext4_extent_idx *ix; + if (eh->eh_depth == 0) + /* + * No extra blocks allocated for extent meta data + */ + return 0; + ix = EXT_FIRST_INDEX(eh); + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { + retval = free_ext_idx(handle, inode, ix); + if (retval) + return retval; + } + return retval; +} + +int ext4_ext_migrate(struct inode *inode) +{ + handle_t *handle; + int retval = 0, i; + __le32 *i_data; + struct ext4_inode_info *ei; + struct inode *tmp_inode = NULL; + struct migrate_struct lb; + unsigned long max_entries; + __u32 goal, tmp_csum_seed; + uid_t owner[2]; + int alloc_ctx; + + /* + * If the filesystem does not support extents, or the inode + * already is extent-based, error out. + */ + if (!ext4_has_feature_extents(inode->i_sb) || + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || + ext4_has_inline_data(inode)) + return -EINVAL; + + if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) + /* + * don't migrate fast symlink + */ + return retval; + + alloc_ctx = ext4_writepages_down_write(inode->i_sb); + + /* + * Worst case we can touch the allocation bitmaps and a block + * group descriptor block. We do need to worry about + * credits for modifying the quota inode. + */ + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, + 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); + + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + goto out_unlock; + } + goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * + EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; + owner[0] = i_uid_read(inode); + owner[1] = i_gid_read(inode); + tmp_inode = ext4_new_inode(handle, d_inode(inode->i_sb->s_root), + S_IFREG, NULL, goal, owner, 0); + if (IS_ERR(tmp_inode)) { + retval = PTR_ERR(tmp_inode); + ext4_journal_stop(handle); + goto out_unlock; + } + /* + * Use the correct seed for checksum (i.e. the seed from 'inode'). 
This + * is so that the metadata blocks will have the correct checksum after + * the migration. + */ + ei = EXT4_I(inode); + tmp_csum_seed = EXT4_I(tmp_inode)->i_csum_seed; + EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed; + i_size_write(tmp_inode, i_size_read(inode)); + /* + * Set the i_nlink to zero so it will be deleted later + * when we drop inode reference. + */ + clear_nlink(tmp_inode); + + ext4_ext_tree_init(handle, tmp_inode); + ext4_journal_stop(handle); + + /* + * start with one credit accounted for + * superblock modification. + * + * For the tmp_inode we already have committed the + * transaction that created the inode. Later as and + * when we add extents we extent the journal + */ + /* + * Even though we take i_rwsem we can still cause block + * allocation via mmap write to holes. If we have allocated + * new blocks we fail migrate. New block allocation will + * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated + * with i_data_sem held to prevent racing with block + * allocation. + */ + down_read(&EXT4_I(inode)->i_data_sem); + ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + up_read((&EXT4_I(inode)->i_data_sem)); + + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + goto out_tmp_inode; + } + + i_data = ei->i_data; + memset(&lb, 0, sizeof(lb)); + + /* 32 bit block address 4 bytes */ + max_entries = inode->i_sb->s_blocksize >> 2; + for (i = 0; i < EXT4_NDIR_BLOCKS; i++) { + if (i_data[i]) { + retval = update_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[i]), &lb); + if (retval) + goto err_out; + } else + lb.curr_block++; + } + if (i_data[EXT4_IND_BLOCK]) { + retval = update_ind_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb); + if (retval) + goto err_out; + } else + lb.curr_block += max_entries; + if (i_data[EXT4_DIND_BLOCK]) { + retval = update_dind_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb); + if (retval) + goto err_out; + } else + lb.curr_block += max_entries * max_entries; + if (i_data[EXT4_TIND_BLOCK]) { + retval = update_tind_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb); + if (retval) + goto err_out; + } + /* + * Build the last extent + */ + retval = finish_range(handle, tmp_inode, &lb); +err_out: + if (retval) + /* + * Failure case delete the extent information with the + * tmp_inode + */ + free_ext_block(handle, tmp_inode); + else { + retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode); + if (retval) + /* + * if we fail to swap inode data free the extent + * details of the tmp inode + */ + free_ext_block(handle, tmp_inode); + } + + /* We mark the tmp_inode dirty via ext4_ext_tree_init. */ + retval = ext4_journal_ensure_credits(handle, 1, 0); + if (retval < 0) + goto out_stop; + /* + * Mark the tmp_inode as of size zero + */ + i_size_write(tmp_inode, 0); + + /* + * set the i_blocks count to zero + * so that the ext4_evict_inode() does the + * right job + * + * We don't need to take the i_lock because + * the inode is not visible to user space. 
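+	 * (The swapped-in data blocks now belong to the original
+	 * inode, so clearing the size and block count and resetting
+	 * the extent tree keeps ext4_evict_inode() from freeing them
+	 * again when the final iput() drops tmp_inode.)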
+ */ + tmp_inode->i_blocks = 0; + EXT4_I(tmp_inode)->i_csum_seed = tmp_csum_seed; + + /* Reset the extent details */ + ext4_ext_tree_init(handle, tmp_inode); +out_stop: + ext4_journal_stop(handle); +out_tmp_inode: + unlock_new_inode(tmp_inode); + iput(tmp_inode); +out_unlock: + ext4_writepages_up_write(inode->i_sb, alloc_ctx); + return retval; +} + +/* + * Migrate a simple extent-based inode to use the i_blocks[] array + */ +int ext4_ind_migrate(struct inode *inode) +{ + struct ext4_extent_header *eh; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_super_block *es = sbi->s_es; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_extent *ex; + unsigned int i, len; + ext4_lblk_t start, end; + ext4_fsblk_t blk; + handle_t *handle; + int ret, ret2 = 0; + int alloc_ctx; + + if (!ext4_has_feature_extents(inode->i_sb) || + (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EINVAL; + + if (ext4_has_feature_bigalloc(inode->i_sb)) + return -EOPNOTSUPP; + + /* + * In order to get correct extent info, force all delayed allocation + * blocks to be allocated, otherwise delayed allocation blocks may not + * be reflected and bypass the checks on extent header. + */ + if (test_opt(inode->i_sb, DELALLOC)) + ext4_alloc_da_blocks(inode); + + alloc_ctx = ext4_writepages_down_write(inode->i_sb); + + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_unlock; + } + + down_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_ext_check_inode(inode); + if (ret) + goto errout; + + eh = ext_inode_hdr(inode); + ex = EXT_FIRST_EXTENT(eh); + if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS || + eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) { + ret = -EOPNOTSUPP; + goto errout; + } + if (eh->eh_entries == 0) + blk = len = start = end = 0; + else { + len = le16_to_cpu(ex->ee_len); + blk = ext4_ext_pblock(ex); + start = le32_to_cpu(ex->ee_block); + end = start + len - 1; + if (end >= EXT4_NDIR_BLOCKS) { + ret = -EOPNOTSUPP; + goto errout; + } + } + + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); + memset(ei->i_data, 0, sizeof(ei->i_data)); + for (i = start; i <= end; i++) + ei->i_data[i] = cpu_to_le32(blk++); + ret2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret2 && !ret)) + ret = ret2; +errout: + up_write(&EXT4_I(inode)->i_data_sem); + ext4_journal_stop(handle); +out_unlock: + ext4_writepages_up_write(inode->i_sb, alloc_ctx); + return ret; +} diff --git a/fs/ext4l/mmp.c b/fs/ext4l/mmp.c new file mode 100644 index 00000000000..ab1ff51302f --- /dev/null +++ b/fs/ext4l/mmp.c @@ -0,0 +1,408 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/fs.h> +#include <linux/random.h> +#include <linux/buffer_head.h> +#include <linux/utsname.h> +#include <linux/kthread.h> + +#include "ext4.h" + +/* Checksumming functions */ +static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int offset = offsetof(struct mmp_struct, mmp_checksum); + __u32 csum; + + csum = ext4_chksum(sbi->s_csum_seed, (char *)mmp, offset); + + return cpu_to_le32(csum); +} + +static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) +{ + if (!ext4_has_feature_metadata_csum(sb)) + return 1; + + return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); +} + +static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) +{ + if (!ext4_has_feature_metadata_csum(sb)) + return; + + mmp->mmp_checksum = ext4_mmp_csum(sb, mmp); +} + +/* + * Write the MMP 
block using REQ_SYNC to try to get the block on-disk + * faster. + */ +static int write_mmp_block_thawed(struct super_block *sb, + struct buffer_head *bh) +{ + struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data); + + ext4_mmp_csum_set(sb, mmp); + lock_buffer(bh); + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, bh); + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) + return -EIO; + return 0; +} + +static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) +{ + int err; + + /* + * We protect against freezing so that we don't create dirty buffers + * on frozen filesystem. + */ + sb_start_write(sb); + err = write_mmp_block_thawed(sb, bh); + sb_end_write(sb); + return err; +} + +/* + * Read the MMP block. It _must_ be read from disk and hence we clear the + * uptodate flag on the buffer. + */ +static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, + ext4_fsblk_t mmp_block) +{ + struct mmp_struct *mmp; + int ret; + + if (*bh) + clear_buffer_uptodate(*bh); + + /* This would be sb_bread(sb, mmp_block), except we need to be sure + * that the MD RAID device cache has been bypassed, and that the read + * is not blocked in the elevator. */ + if (!*bh) { + *bh = sb_getblk(sb, mmp_block); + if (!*bh) { + ret = -ENOMEM; + goto warn_exit; + } + } + + lock_buffer(*bh); + ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL, false); + if (ret) + goto warn_exit; + + mmp = (struct mmp_struct *)((*bh)->b_data); + if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) { + ret = -EFSCORRUPTED; + goto warn_exit; + } + if (!ext4_mmp_csum_verify(sb, mmp)) { + ret = -EFSBADCRC; + goto warn_exit; + } + return 0; +warn_exit: + brelse(*bh); + *bh = NULL; + ext4_warning(sb, "Error %d while reading MMP block %llu", + ret, mmp_block); + return ret; +} + +/* + * Dump as much information as possible to help the admin. + */ +void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, + const char *function, unsigned int line, const char *msg) +{ + __ext4_warning(sb, function, line, "%s", msg); + __ext4_warning(sb, function, line, + "MMP failure info: last update time: %llu, last update node: %.*s, last update device: %.*s", + (unsigned long long)le64_to_cpu(mmp->mmp_time), + (int)sizeof(mmp->mmp_nodename), mmp->mmp_nodename, + (int)sizeof(mmp->mmp_bdevname), mmp->mmp_bdevname); +} + +/* + * kmmpd will update the MMP sequence every s_mmp_update_interval seconds + */ +static int kmmpd(void *data) +{ + struct super_block *sb = data; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct buffer_head *bh = EXT4_SB(sb)->s_mmp_bh; + struct mmp_struct *mmp; + ext4_fsblk_t mmp_block; + u32 seq = 0; + unsigned long failed_writes = 0; + int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); + unsigned mmp_check_interval; + unsigned long last_update_time; + unsigned long diff; + int retval = 0; + + mmp_block = le64_to_cpu(es->s_mmp_block); + mmp = (struct mmp_struct *)(bh->b_data); + mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds()); + /* + * Start with the higher mmp_check_interval and reduce it if + * the MMP block is being updated on time. 
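+	 * With the default 5 second update interval and the usual
+	 * EXT4_MMP_CHECK_MULT of 2, this starts out as
+	 * max(2 * 5, EXT4_MMP_MIN_CHECK_INTERVAL) = 10 seconds.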
+ */ + mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, + EXT4_MMP_MIN_CHECK_INTERVAL); + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + + memcpy(mmp->mmp_nodename, init_utsname()->nodename, + sizeof(mmp->mmp_nodename)); + + while (!kthread_should_stop() && !ext4_emergency_state(sb)) { + if (!ext4_has_feature_mmp(sb)) { + ext4_warning(sb, "kmmpd being stopped since MMP feature" + " has been disabled."); + goto wait_to_exit; + } + if (++seq > EXT4_MMP_SEQ_MAX) + seq = 1; + + mmp->mmp_seq = cpu_to_le32(seq); + mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds()); + last_update_time = jiffies; + + retval = write_mmp_block(sb, bh); + /* + * Don't spew too many error messages. Print one every + * (s_mmp_update_interval * 60) seconds. + */ + if (retval) { + if ((failed_writes % 60) == 0) { + ext4_error_err(sb, -retval, + "Error writing to MMP block"); + } + failed_writes++; + } + + diff = jiffies - last_update_time; + if (diff < mmp_update_interval * HZ) + schedule_timeout_interruptible(mmp_update_interval * + HZ - diff); + + /* + * We need to make sure that more than mmp_check_interval + * seconds have not passed since writing. If that has happened + * we need to check if the MMP block is as we left it. + */ + diff = jiffies - last_update_time; + if (diff > mmp_check_interval * HZ) { + struct buffer_head *bh_check = NULL; + struct mmp_struct *mmp_check; + + retval = read_mmp_block(sb, &bh_check, mmp_block); + if (retval) { + ext4_error_err(sb, -retval, + "error reading MMP data: %d", + retval); + goto wait_to_exit; + } + + mmp_check = (struct mmp_struct *)(bh_check->b_data); + if (mmp->mmp_seq != mmp_check->mmp_seq || + memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, + sizeof(mmp->mmp_nodename))) { + dump_mmp_msg(sb, mmp_check, + "Error while updating MMP info. " + "The filesystem seems to have been" + " multiply mounted."); + ext4_error_err(sb, EBUSY, "abort"); + put_bh(bh_check); + retval = -EBUSY; + goto wait_to_exit; + } + put_bh(bh_check); + } + + /* + * Adjust the mmp_check_interval depending on how much time + * it took for the MMP block to be written. + */ + mmp_check_interval = clamp(EXT4_MMP_CHECK_MULT * diff / HZ, + EXT4_MMP_MIN_CHECK_INTERVAL, + EXT4_MMP_MAX_CHECK_INTERVAL); + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + } + + /* + * Unmount seems to be clean. + */ + mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); + mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds()); + + retval = write_mmp_block(sb, bh); + +wait_to_exit: + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (!kthread_should_stop()) + schedule(); + } + set_current_state(TASK_RUNNING); + return retval; +} + +void ext4_stop_mmpd(struct ext4_sb_info *sbi) +{ + if (sbi->s_mmp_tsk) { + kthread_stop(sbi->s_mmp_tsk); + brelse(sbi->s_mmp_bh); + sbi->s_mmp_tsk = NULL; + } +} + +/* + * Get a random new sequence number but make sure it is not greater than + * EXT4_MMP_SEQ_MAX. + */ +static unsigned int mmp_new_seq(void) +{ + return get_random_u32_below(EXT4_MMP_SEQ_MAX + 1); +} + +/* + * Protect the filesystem from being mounted more than once. 
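+ * The protocol: read the MMP block, wait one check interval, and
+ * read it again. If the sequence number changed in the meantime,
+ * another node is updating the block and the mount is refused with
+ * -EBUSY.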
+ */ +int ext4_multi_mount_protect(struct super_block *sb, + ext4_fsblk_t mmp_block) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct buffer_head *bh = NULL; + struct mmp_struct *mmp = NULL; + u32 seq; + unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); + unsigned int wait_time = 0; + int retval; + + if (mmp_block < le32_to_cpu(es->s_first_data_block) || + mmp_block >= ext4_blocks_count(es)) { + ext4_warning(sb, "Invalid MMP block in superblock"); + retval = -EINVAL; + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + + mmp = (struct mmp_struct *)(bh->b_data); + + if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) + mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; + + /* + * If check_interval in MMP block is larger, use that instead of + * update_interval from the superblock. + */ + if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval) + mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval); + + seq = le32_to_cpu(mmp->mmp_seq); + if (seq == EXT4_MMP_SEQ_CLEAN) + goto skip; + + if (seq == EXT4_MMP_SEQ_FSCK) { + dump_mmp_msg(sb, mmp, "fsck is running on the filesystem"); + retval = -EBUSY; + goto failed; + } + + wait_time = min(mmp_check_interval * 2 + 1, + mmp_check_interval + 60); + + /* Print MMP interval if more than 20 secs. */ + if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) + ext4_warning(sb, "MMP interval %u higher than expected, please" + " wait.\n", wait_time * 2); + + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { + ext4_warning(sb, "MMP startup interrupted, failing mount\n"); + retval = -ETIMEDOUT; + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + mmp = (struct mmp_struct *)(bh->b_data); + if (seq != le32_to_cpu(mmp->mmp_seq)) { + dump_mmp_msg(sb, mmp, + "Device is already active on another node."); + retval = -EBUSY; + goto failed; + } + +skip: + /* + * write a new random sequence number. + */ + seq = mmp_new_seq(); + mmp->mmp_seq = cpu_to_le32(seq); + + /* + * On mount / remount we are protected against fs freezing (by s_umount + * semaphore) and grabbing freeze protection upsets lockdep + */ + retval = write_mmp_block_thawed(sb, bh); + if (retval) + goto failed; + + /* + * wait for MMP interval and check mmp_seq. + */ + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { + ext4_warning(sb, "MMP startup interrupted, failing mount"); + retval = -ETIMEDOUT; + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + mmp = (struct mmp_struct *)(bh->b_data); + if (seq != le32_to_cpu(mmp->mmp_seq)) { + dump_mmp_msg(sb, mmp, + "Device is already active on another node."); + retval = -EBUSY; + goto failed; + } + + EXT4_SB(sb)->s_mmp_bh = bh; + + BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE); + snprintf(mmp->mmp_bdevname, sizeof(mmp->mmp_bdevname), + "%pg", bh->b_bdev); + + /* + * Start a kernel thread to update the MMP block periodically. 
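+	 * kmmpd re-stamps mmp_seq and mmp_time every
+	 * s_mmp_update_interval seconds until it is stopped at unmount
+	 * or the MMP feature is cleared.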
+ */ + EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%.*s", + (int)sizeof(mmp->mmp_bdevname), + mmp->mmp_bdevname); + if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { + EXT4_SB(sb)->s_mmp_tsk = NULL; + ext4_warning(sb, "Unable to create kmmpd thread for %s.", + sb->s_id); + retval = -ENOMEM; + goto failed; + } + + return 0; + +failed: + brelse(bh); + return retval; +} diff --git a/fs/ext4l/move_extent.c b/fs/ext4l/move_extent.c new file mode 100644 index 00000000000..4b091c21908 --- /dev/null +++ b/fs/ext4l/move_extent.c @@ -0,0 +1,706 @@ +// SPDX-License-Identifier: LGPL-2.1 +/* + * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd. + * Written by Takashi Sato <t-sato@yk.jp.nec.com> + * Akira Fujita <a-fujita@rs.jp.nec.com> + */ + +#include <linux/fs.h> +#include <linux/quotaops.h> +#include <linux/slab.h> +#include <linux/sched/mm.h> +#include "ext4_jbd2.h" +#include "ext4.h" +#include "ext4_extents.h" + +/** + * get_ext_path() - Find an extent path for designated logical block number. + * @inode: inode to be searched + * @lblock: logical block number to find an extent path + * @path: pointer to an extent path + * + * ext4_find_extent wrapper. Return an extent path pointer on success, + * or an error pointer on failure. + */ +static inline struct ext4_ext_path * +get_ext_path(struct inode *inode, ext4_lblk_t lblock, + struct ext4_ext_path *path) +{ + path = ext4_find_extent(inode, lblock, path, EXT4_EX_NOCACHE); + if (IS_ERR(path)) + return path; + if (path[ext_depth(inode)].p_ext == NULL) { + ext4_free_ext_path(path); + return ERR_PTR(-ENODATA); + } + return path; +} + +/** + * ext4_double_down_write_data_sem() - write lock two inodes's i_data_sem + * @first: inode to be locked + * @second: inode to be locked + * + * Acquire write lock of i_data_sem of the two inodes + */ +void +ext4_double_down_write_data_sem(struct inode *first, struct inode *second) +{ + if (first < second) { + down_write(&EXT4_I(first)->i_data_sem); + down_write_nested(&EXT4_I(second)->i_data_sem, I_DATA_SEM_OTHER); + } else { + down_write(&EXT4_I(second)->i_data_sem); + down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER); + + } +} + +/** + * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem + * + * @orig_inode: original inode structure to be released its lock first + * @donor_inode: donor inode structure to be released its lock second + * Release write lock of i_data_sem of two inodes (orig and donor). + */ +void +ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode) +{ + up_write(&EXT4_I(orig_inode)->i_data_sem); + up_write(&EXT4_I(donor_inode)->i_data_sem); +} + +/** + * mext_check_coverage - Check that all extents in range has the same type + * + * @inode: inode in question + * @from: block offset of inode + * @count: block count to be checked + * @unwritten: extents expected to be unwritten + * @err: pointer to save error value + * + * Return 1 if all extents in range has expected type, and zero otherwise. 
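+ * The caller uses this to re-check, with i_data_sem held, that an
+ * unwritten range did not become initialized while the lock was
+ * dropped; see move_extent_per_page().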
+ */ +static int +mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, + int unwritten, int *err) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ext; + int ret = 0; + ext4_lblk_t last = from + count; + while (from < last) { + path = get_ext_path(inode, from, path); + if (IS_ERR(path)) { + *err = PTR_ERR(path); + return ret; + } + ext = path[ext_depth(inode)].p_ext; + if (unwritten != ext4_ext_is_unwritten(ext)) + goto out; + from += ext4_ext_get_actual_len(ext); + } + ret = 1; +out: + ext4_free_ext_path(path); + return ret; +} + +/** + * mext_folio_double_lock - Grab and lock folio on both @inode1 and @inode2 + * + * @inode1: the inode structure + * @inode2: the inode structure + * @index1: folio index + * @index2: folio index + * @folio: result folio vector + * + * Grab two locked folio for inode's by inode order + */ +static int +mext_folio_double_lock(struct inode *inode1, struct inode *inode2, + pgoff_t index1, pgoff_t index2, struct folio *folio[2]) +{ + struct address_space *mapping[2]; + unsigned int flags; + + BUG_ON(!inode1 || !inode2); + if (inode1 < inode2) { + mapping[0] = inode1->i_mapping; + mapping[1] = inode2->i_mapping; + } else { + swap(index1, index2); + mapping[0] = inode2->i_mapping; + mapping[1] = inode1->i_mapping; + } + + flags = memalloc_nofs_save(); + folio[0] = __filemap_get_folio(mapping[0], index1, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping[0])); + if (IS_ERR(folio[0])) { + memalloc_nofs_restore(flags); + return PTR_ERR(folio[0]); + } + + folio[1] = __filemap_get_folio(mapping[1], index2, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping[1])); + memalloc_nofs_restore(flags); + if (IS_ERR(folio[1])) { + folio_unlock(folio[0]); + folio_put(folio[0]); + return PTR_ERR(folio[1]); + } + /* + * __filemap_get_folio() may not wait on folio's writeback if + * BDI not demand that. 
But it is reasonable to be very conservative + * here and explicitly wait on folio's writeback + */ + folio_wait_writeback(folio[0]); + folio_wait_writeback(folio[1]); + if (inode1 > inode2) + swap(folio[0], folio[1]); + + return 0; +} + +/* Force folio buffers uptodate w/o dropping folio's lock */ +static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to) +{ + struct inode *inode = folio->mapping->host; + sector_t block; + struct buffer_head *bh, *head; + unsigned int blocksize, block_start, block_end; + int nr = 0; + bool partial = false; + + BUG_ON(!folio_test_locked(folio)); + BUG_ON(folio_test_writeback(folio)); + + if (folio_test_uptodate(folio)) + return 0; + + blocksize = i_blocksize(inode); + head = folio_buffers(folio); + if (!head) + head = create_empty_buffers(folio, blocksize, 0); + + block = folio_pos(folio) >> inode->i_blkbits; + block_end = 0; + bh = head; + do { + block_start = block_end; + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = true; + continue; + } + if (buffer_uptodate(bh)) + continue; + if (!buffer_mapped(bh)) { + int err = ext4_get_block(inode, block, bh, 0); + if (err) + return err; + if (!buffer_mapped(bh)) { + folio_zero_range(folio, block_start, blocksize); + set_buffer_uptodate(bh); + continue; + } + } + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + continue; + } + ext4_read_bh_nowait(bh, 0, NULL, false); + nr++; + } while (block++, (bh = bh->b_this_page) != head); + + /* No io required */ + if (!nr) + goto out; + + bh = head; + do { + if (bh_offset(bh) + blocksize <= from) + continue; + if (bh_offset(bh) >= to) + break; + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + continue; + return -EIO; + } while ((bh = bh->b_this_page) != head); +out: + if (!partial) + folio_mark_uptodate(folio); + return 0; +} + +/** + * move_extent_per_page - Move extent data per page + * + * @o_filp: file structure of original file + * @donor_inode: donor inode + * @orig_page_offset: page index on original file + * @donor_page_offset: page index on donor file + * @data_offset_in_page: block index where data swapping starts + * @block_len_in_page: the number of blocks to be swapped + * @unwritten: orig extent is unwritten or not + * @err: pointer to save return value + * + * Save the data in original inode blocks and replace original inode extents + * with donor inode extents by calling ext4_swap_extents(). + * Finally, write out the saved data in new original inode blocks. Return + * replaced block count. + */ +static int +move_extent_per_page(struct file *o_filp, struct inode *donor_inode, + pgoff_t orig_page_offset, pgoff_t donor_page_offset, + int data_offset_in_page, + int block_len_in_page, int unwritten, int *err) +{ + struct inode *orig_inode = file_inode(o_filp); + struct folio *folio[2] = {NULL, NULL}; + handle_t *handle; + ext4_lblk_t orig_blk_offset, donor_blk_offset; + unsigned long blocksize = orig_inode->i_sb->s_blocksize; + unsigned int tmp_data_size, data_size, replaced_size; + int i, err2, jblocks, retries = 0; + int replaced_count = 0; + int from; + int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; + struct super_block *sb = orig_inode->i_sb; + struct buffer_head *bh = NULL; + + /* + * It needs twice the amount of ordinary journal buffers because + * inode and donor_inode may change each different metadata blocks. 
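+	 * Hence the ext4_meta_trans_blocks() estimate below is
+	 * doubled: one estimate for the original inode and one for the
+	 * donor.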
+ */ +again: + *err = 0; + jblocks = ext4_meta_trans_blocks(orig_inode, block_len_in_page, + block_len_in_page) * 2; + handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks); + if (IS_ERR(handle)) { + *err = PTR_ERR(handle); + return 0; + } + + orig_blk_offset = orig_page_offset * blocks_per_page + + data_offset_in_page; + + donor_blk_offset = donor_page_offset * blocks_per_page + + data_offset_in_page; + + /* Calculate data_size */ + if ((orig_blk_offset + block_len_in_page - 1) == + ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { + /* Replace the last block */ + tmp_data_size = orig_inode->i_size & (blocksize - 1); + /* + * If data_size equal zero, it shows data_size is multiples of + * blocksize. So we set appropriate value. + */ + if (tmp_data_size == 0) + tmp_data_size = blocksize; + + data_size = tmp_data_size + + ((block_len_in_page - 1) << orig_inode->i_blkbits); + } else + data_size = block_len_in_page << orig_inode->i_blkbits; + + replaced_size = data_size; + + *err = mext_folio_double_lock(orig_inode, donor_inode, orig_page_offset, + donor_page_offset, folio); + if (unlikely(*err < 0)) + goto stop_journal; + /* + * If orig extent was unwritten it can become initialized + * at any time after i_data_sem was dropped, in order to + * serialize with delalloc we have recheck extent while we + * hold page's lock, if it is still the case data copy is not + * necessary, just swap data blocks between orig and donor. + */ + if (unwritten) { + ext4_double_down_write_data_sem(orig_inode, donor_inode); + /* If any of extents in range became initialized we have to + * fallback to data copying */ + unwritten = mext_check_coverage(orig_inode, orig_blk_offset, + block_len_in_page, 1, err); + if (*err) + goto drop_data_sem; + + unwritten &= mext_check_coverage(donor_inode, donor_blk_offset, + block_len_in_page, 1, err); + if (*err) + goto drop_data_sem; + + if (!unwritten) { + ext4_double_up_write_data_sem(orig_inode, donor_inode); + goto data_copy; + } + if (!filemap_release_folio(folio[0], 0) || + !filemap_release_folio(folio[1], 0)) { + *err = -EBUSY; + goto drop_data_sem; + } + replaced_count = ext4_swap_extents(handle, orig_inode, + donor_inode, orig_blk_offset, + donor_blk_offset, + block_len_in_page, 1, err); + drop_data_sem: + ext4_double_up_write_data_sem(orig_inode, donor_inode); + goto unlock_folios; + } +data_copy: + from = offset_in_folio(folio[0], + orig_blk_offset << orig_inode->i_blkbits); + *err = mext_page_mkuptodate(folio[0], from, from + replaced_size); + if (*err) + goto unlock_folios; + + /* At this point all buffers in range are uptodate, old mapping layout + * is no longer required, try to drop it now. 
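+	 * (filemap_release_folio() fails if the buffers are still
+	 * pinned to a journal transaction; we then bail out with
+	 * -EBUSY and the retry logic at stop_journal forces a commit
+	 * and tries again.)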
*/ + if (!filemap_release_folio(folio[0], 0) || + !filemap_release_folio(folio[1], 0)) { + *err = -EBUSY; + goto unlock_folios; + } + ext4_double_down_write_data_sem(orig_inode, donor_inode); + replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, + orig_blk_offset, donor_blk_offset, + block_len_in_page, 1, err); + ext4_double_up_write_data_sem(orig_inode, donor_inode); + if (*err) { + if (replaced_count) { + block_len_in_page = replaced_count; + replaced_size = + block_len_in_page << orig_inode->i_blkbits; + } else + goto unlock_folios; + } + /* Perform all necessary steps similar write_begin()/write_end() + * but keeping in mind that i_size will not change */ + bh = folio_buffers(folio[0]); + if (!bh) + bh = create_empty_buffers(folio[0], + 1 << orig_inode->i_blkbits, 0); + for (i = 0; i < from >> orig_inode->i_blkbits; i++) + bh = bh->b_this_page; + for (i = 0; i < block_len_in_page; i++) { + *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); + if (*err < 0) + goto repair_branches; + bh = bh->b_this_page; + } + + block_commit_write(folio[0], from, from + replaced_size); + + /* Even in case of data=writeback it is reasonable to pin + * inode to transaction, to prevent unexpected data loss */ + *err = ext4_jbd2_inode_add_write(handle, orig_inode, + (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size); + +unlock_folios: + folio_unlock(folio[0]); + folio_put(folio[0]); + folio_unlock(folio[1]); + folio_put(folio[1]); +stop_journal: + ext4_journal_stop(handle); + if (*err == -ENOSPC && + ext4_should_retry_alloc(sb, &retries)) + goto again; + /* Buffer was busy because probably is pinned to journal transaction, + * force transaction commit may help to free it. */ + if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal && + jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal)) + goto again; + return replaced_count; + +repair_branches: + /* + * This should never ever happen! + * Extents are swapped already, but we are not able to copy data. + * Try to swap extents to it's original places + */ + ext4_double_down_write_data_sem(orig_inode, donor_inode); + replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode, + orig_blk_offset, donor_blk_offset, + block_len_in_page, 0, &err2); + ext4_double_up_write_data_sem(orig_inode, donor_inode); + if (replaced_count != block_len_in_page) { + ext4_error_inode_block(orig_inode, (sector_t)(orig_blk_offset), + EIO, "Unable to copy data block," + " data will be lost."); + *err = -EIO; + } + replaced_count = 0; + goto unlock_folios; +} + +/** + * mext_check_arguments - Check whether move extent can be done + * + * @orig_inode: original inode + * @donor_inode: donor inode + * @orig_start: logical start offset in block for orig + * @donor_start: logical start offset in block for donor + * @len: the number of blocks to be moved + * + * Check the arguments of ext4_move_extents() whether the files can be + * exchanged with each other. + * Return 0 on success, or a negative error value on failure. 
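+ * Note that *len may be trimmed here so that the checked range does
+ * not extend past the end of either file.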
+ */ +static int +mext_check_arguments(struct inode *orig_inode, + struct inode *donor_inode, __u64 orig_start, + __u64 donor_start, __u64 *len) +{ + __u64 orig_eof, donor_eof; + unsigned int blkbits = orig_inode->i_blkbits; + unsigned int blocksize = 1 << blkbits; + + orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits; + donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits; + + + if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { + ext4_debug("ext4 move extent: suid or sgid is set" + " to donor file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) + return -EPERM; + + /* Ext4 move extent does not support swap files */ + if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { + ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -ETXTBSY; + } + + if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) { + ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EOPNOTSUPP; + } + + /* Ext4 move extent supports only extent based file */ + if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { + ext4_debug("ext4 move extent: orig file is not extents " + "based file [ino:orig %lu]\n", orig_inode->i_ino); + return -EOPNOTSUPP; + } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) { + ext4_debug("ext4 move extent: donor file is not extents " + "based file [ino:donor %lu]\n", donor_inode->i_ino); + return -EOPNOTSUPP; + } + + if ((!orig_inode->i_size) || (!donor_inode->i_size)) { + ext4_debug("ext4 move extent: File size is 0 byte\n"); + return -EINVAL; + } + + /* Start offset should be same */ + if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) != + (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) { + ext4_debug("ext4 move extent: orig and donor's start " + "offsets are not aligned [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if ((orig_start >= EXT_MAX_BLOCKS) || + (donor_start >= EXT_MAX_BLOCKS) || + (*len > EXT_MAX_BLOCKS) || + (donor_start + *len >= EXT_MAX_BLOCKS) || + (orig_start + *len >= EXT_MAX_BLOCKS)) { + ext4_debug("ext4 move extent: Can't handle over [%u] blocks " + "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + if (orig_eof <= orig_start) + *len = 0; + else if (orig_eof < orig_start + *len - 1) + *len = orig_eof - orig_start; + if (donor_eof <= donor_start) + *len = 0; + else if (donor_eof < donor_start + *len - 1) + *len = donor_eof - donor_start; + if (!*len) { + ext4_debug("ext4 move extent: len should not be 0 " + "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, + donor_inode->i_ino); + return -EINVAL; + } + + return 0; +} + +/** + * ext4_move_extents - Exchange the specified range of a file + * + * @o_filp: file structure of the original file + * @d_filp: file structure of the donor file + * @orig_blk: start offset in block for orig + * @donor_blk: start offset in block for donor + * @len: the number of blocks to be moved + * @moved_len: moved block length + * + * This function returns 0 and moved block length is set in moved_len + * if succeed, otherwise returns error value. 
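+ * On failure part way through, *moved_len still reflects how many
+ * blocks were swapped before the error occurred.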
+ * + */ +int +ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, + __u64 donor_blk, __u64 len, __u64 *moved_len) +{ + struct inode *orig_inode = file_inode(o_filp); + struct inode *donor_inode = file_inode(d_filp); + struct ext4_ext_path *path = NULL; + int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; + ext4_lblk_t o_end, o_start = orig_blk; + ext4_lblk_t d_start = donor_blk; + int ret; + + if (orig_inode->i_sb != donor_inode->i_sb) { + ext4_debug("ext4 move extent: The argument files " + "should be in same FS [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* orig and donor should be different inodes */ + if (orig_inode == donor_inode) { + ext4_debug("ext4 move extent: The argument files should not " + "be same inode [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Regular file check */ + if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { + ext4_debug("ext4 move extent: The argument files should be " + "regular file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* TODO: it's not obvious how to swap blocks for inodes with full + journaling enabled */ + if (ext4_should_journal_data(orig_inode) || + ext4_should_journal_data(donor_inode)) { + ext4_msg(orig_inode->i_sb, KERN_ERR, + "Online defrag not supported with data journaling"); + return -EOPNOTSUPP; + } + + if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) { + ext4_msg(orig_inode->i_sb, KERN_ERR, + "Online defrag not supported for encrypted files"); + return -EOPNOTSUPP; + } + + /* Protect orig and donor inodes against a truncate */ + lock_two_nondirectories(orig_inode, donor_inode); + + /* Wait for all existing dio workers */ + inode_dio_wait(orig_inode); + inode_dio_wait(donor_inode); + + /* Protect extent tree against block allocations via delalloc */ + ext4_double_down_write_data_sem(orig_inode, donor_inode); + /* Check the filesystem environment whether move_extent can be done */ + ret = mext_check_arguments(orig_inode, donor_inode, orig_blk, + donor_blk, &len); + if (ret) + goto out; + o_end = o_start + len; + + *moved_len = 0; + while (o_start < o_end) { + struct ext4_extent *ex; + ext4_lblk_t cur_blk, next_blk; + pgoff_t orig_page_index, donor_page_index; + int offset_in_page; + int unwritten, cur_len; + + path = get_ext_path(orig_inode, o_start, path); + if (IS_ERR(path)) { + ret = PTR_ERR(path); + goto out; + } + ex = path[path->p_depth].p_ext; + cur_blk = le32_to_cpu(ex->ee_block); + cur_len = ext4_ext_get_actual_len(ex); + /* Check hole before the start pos */ + if (cur_blk + cur_len - 1 < o_start) { + next_blk = ext4_ext_next_allocated_block(path); + if (next_blk == EXT_MAX_BLOCKS) { + ret = -ENODATA; + goto out; + } + d_start += next_blk - o_start; + o_start = next_blk; + continue; + /* Check hole after the start pos */ + } else if (cur_blk > o_start) { + /* Skip hole */ + d_start += cur_blk - o_start; + o_start = cur_blk; + /* Extent inside requested range ?*/ + if (cur_blk >= o_end) + goto out; + } else { /* in_range(o_start, o_blk, o_len) */ + cur_len += cur_blk - o_start; + } + unwritten = ext4_ext_is_unwritten(ex); + if (o_end - o_start < cur_len) + cur_len = o_end - o_start; + + orig_page_index = o_start >> (PAGE_SHIFT - + orig_inode->i_blkbits); + donor_page_index = d_start >> (PAGE_SHIFT - + donor_inode->i_blkbits); + offset_in_page = o_start % blocks_per_page; + if (cur_len > blocks_per_page - 
offset_in_page) + cur_len = blocks_per_page - offset_in_page; + /* + * Up semaphore to avoid following problems: + * a. transaction deadlock among ext4_journal_start, + * ->write_begin via pagefault, and jbd2_journal_commit + * b. racing with ->read_folio, ->write_begin, and + * ext4_get_block in move_extent_per_page + */ + ext4_double_up_write_data_sem(orig_inode, donor_inode); + /* Swap original branches with new branches */ + *moved_len += move_extent_per_page(o_filp, donor_inode, + orig_page_index, donor_page_index, + offset_in_page, cur_len, + unwritten, &ret); + ext4_double_down_write_data_sem(orig_inode, donor_inode); + if (ret < 0) + break; + o_start += cur_len; + d_start += cur_len; + } + +out: + if (*moved_len) { + ext4_discard_preallocations(orig_inode); + ext4_discard_preallocations(donor_inode); + } + + ext4_free_ext_path(path); + ext4_double_up_write_data_sem(orig_inode, donor_inode); + unlock_two_nondirectories(orig_inode, donor_inode); + + return ret; +} diff --git a/fs/ext4l/orphan.c b/fs/ext4l/orphan.c new file mode 100644 index 00000000000..82d5e750145 --- /dev/null +++ b/fs/ext4l/orphan.c @@ -0,0 +1,659 @@ +/* + * Ext4 orphan inode handling + */ +#include <linux/fs.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> + +#include "ext4.h" +#include "ext4_jbd2.h" + +static int ext4_orphan_file_add(handle_t *handle, struct inode *inode) +{ + int i, j, start; + struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; + int ret = 0; + bool found = false; + __le32 *bdata; + int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); + int looped = 0; + + /* + * Find block with free orphan entry. Use CPU number for a naive hash + * for a search start in the orphan file + */ + start = raw_smp_processor_id()*13 % oi->of_blocks; + i = start; + do { + if (atomic_dec_if_positive(&oi->of_binfo[i].ob_free_entries) + >= 0) { + found = true; + break; + } + if (++i >= oi->of_blocks) + i = 0; + } while (i != start); + + if (!found) { + /* + * For now we don't grow or shrink orphan file. We just use + * whatever was allocated at mke2fs time. The additional + * credits we would have to reserve for each orphan inode + * operation just don't seem worth it. + */ + return -ENOSPC; + } + + ret = ext4_journal_get_write_access(handle, inode->i_sb, + oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE); + if (ret) { + atomic_inc(&oi->of_binfo[i].ob_free_entries); + return ret; + } + + bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); + /* Find empty slot in a block */ + j = 0; + do { + if (looped) { + /* + * Did we walk through the block several times without + * finding free entry? It is theoretically possible + * if entries get constantly allocated and freed or + * if the block is corrupted. Avoid indefinite looping + * and bail. We'll use orphan list instead. 
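+			 * Returning -ENOSPC makes ext4_orphan_add()
+			 * fall back to the legacy on-disk orphan list.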
+ */ + if (looped > 3) { + atomic_inc(&oi->of_binfo[i].ob_free_entries); + return -ENOSPC; + } + cond_resched(); + } + while (bdata[j]) { + if (++j >= inodes_per_ob) { + j = 0; + looped++; + } + } + } while (cmpxchg(&bdata[j], (__le32)0, cpu_to_le32(inode->i_ino)) != + (__le32)0); + + EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; + ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); + + return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh); +} + +/* + * ext4_orphan_add() links an unlinked or truncated inode into a list of + * such inodes, starting at the superblock, in case we crash before the + * file is closed/deleted, or in case the inode truncate spans multiple + * transactions and the last transaction is not recovered after a crash. + * + * At filesystem recovery time, we walk this list deleting unlinked + * inodes and truncating linked inodes in ext4_orphan_cleanup(). + * + * Orphan list manipulation functions must be called under i_rwsem unless + * we are just creating the inode or deleting it. + */ +int ext4_orphan_add(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_iloc iloc; + int err = 0, rc; + bool dirty = false; + + if (!sbi->s_journal || is_bad_inode(inode)) + return 0; + + WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + !inode_is_locked(inode)); + if (ext4_inode_orphan_tracked(inode)) + return 0; + + /* + * Orphan handling is only valid for files with data blocks + * being truncated, or files being unlinked. Note that we either + * hold i_rwsem, or the inode can not be referenced from outside, + * so i_nlink should not be bumped due to race + */ + ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); + + if (sbi->s_orphan_info.of_blocks) { + err = ext4_orphan_file_add(handle, inode); + /* + * Fallback to normal orphan list of orphan file is + * out of space + */ + if (err != -ENOSPC) + return err; + } + + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); + if (err) + goto out; + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out; + + mutex_lock(&sbi->s_orphan_lock); + /* + * Due to previous errors inode may be already a part of on-disk + * orphan list. If so skip on-disk list modification. + */ + if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) > + (le32_to_cpu(sbi->s_es->s_inodes_count))) { + /* Insert this inode at the head of the on-disk orphan list */ + NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan); + lock_buffer(sbi->s_sbh); + sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + ext4_superblock_csum_set(sb); + unlock_buffer(sbi->s_sbh); + dirty = true; + } + list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan); + mutex_unlock(&sbi->s_orphan_lock); + + if (dirty) { + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); + rc = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; + if (err) { + /* + * We have to remove inode from in-memory list if + * addition to on disk orphan list failed. Stray orphan + * list entries can cause panics at unmount time. 
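+			 * (The unmount path expects the in-memory
+			 * orphan list to be empty by then, hence the
+			 * panic risk if a stale entry were left
+			 * behind.)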
+ */ + mutex_lock(&sbi->s_orphan_lock); + list_del_init(&EXT4_I(inode)->i_orphan); + mutex_unlock(&sbi->s_orphan_lock); + } + } else + brelse(iloc.bh); + + ext4_debug("superblock will point to %lu\n", inode->i_ino); + ext4_debug("orphan inode %lu will point to %d\n", + inode->i_ino, NEXT_ORPHAN(inode)); +out: + ext4_std_error(sb, err); + return err; +} + +static int ext4_orphan_file_del(handle_t *handle, struct inode *inode) +{ + struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; + __le32 *bdata; + int blk, off; + int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); + int ret = 0; + + if (!handle) + goto out; + blk = EXT4_I(inode)->i_orphan_idx / inodes_per_ob; + off = EXT4_I(inode)->i_orphan_idx % inodes_per_ob; + if (WARN_ON_ONCE(blk >= oi->of_blocks)) + goto out; + + ret = ext4_journal_get_write_access(handle, inode->i_sb, + oi->of_binfo[blk].ob_bh, EXT4_JTR_ORPHAN_FILE); + if (ret) + goto out; + + bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data); + bdata[off] = 0; + atomic_inc(&oi->of_binfo[blk].ob_free_entries); + ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh); +out: + ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE); + INIT_LIST_HEAD(&EXT4_I(inode)->i_orphan); + + return ret; +} + +/* + * ext4_orphan_del() removes an unlinked or truncated inode from the list + * of such inodes stored on disk, because it is finally being cleaned up. + */ +int ext4_orphan_del(handle_t *handle, struct inode *inode) +{ + struct list_head *prev; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u32 ino_next; + struct ext4_iloc iloc; + int err = 0; + + if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) + return 0; + + WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + !inode_is_locked(inode)); + if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE)) + return ext4_orphan_file_del(handle, inode); + + /* Do this quick check before taking global s_orphan_lock. */ + if (list_empty(&ei->i_orphan)) + return 0; + + if (handle) { + /* Grab inode buffer early before taking global s_orphan_lock */ + err = ext4_reserve_inode_write(handle, inode, &iloc); + } + + mutex_lock(&sbi->s_orphan_lock); + ext4_debug("remove inode %lu from orphan list\n", inode->i_ino); + + prev = ei->i_orphan.prev; + list_del_init(&ei->i_orphan); + + /* If we're on an error path, we may not have a valid + * transaction handle with which to update the orphan list on + * disk, but we still need to remove the inode from the linked + * list in memory. 
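+	 * A stale on-disk entry is tolerable: ext4_orphan_cleanup()
+	 * below copes safely with entries pointing at in-use or
+	 * already deleted inodes.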
*/ + if (!handle || err) { + mutex_unlock(&sbi->s_orphan_lock); + goto out_err; + } + + ino_next = NEXT_ORPHAN(inode); + if (prev == &sbi->s_orphan) { + ext4_debug("superblock will point to %u\n", ino_next); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode->i_sb, + sbi->s_sbh, EXT4_JTR_NONE); + if (err) { + mutex_unlock(&sbi->s_orphan_lock); + goto out_brelse; + } + lock_buffer(sbi->s_sbh); + sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); + ext4_superblock_csum_set(inode->i_sb); + unlock_buffer(sbi->s_sbh); + mutex_unlock(&sbi->s_orphan_lock); + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); + } else { + struct ext4_iloc iloc2; + struct inode *i_prev = + &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; + + ext4_debug("orphan inode %lu will point to %u\n", + i_prev->i_ino, ino_next); + err = ext4_reserve_inode_write(handle, i_prev, &iloc2); + if (err) { + mutex_unlock(&sbi->s_orphan_lock); + goto out_brelse; + } + NEXT_ORPHAN(i_prev) = ino_next; + err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); + mutex_unlock(&sbi->s_orphan_lock); + } + if (err) + goto out_brelse; + NEXT_ORPHAN(inode) = 0; + err = ext4_mark_iloc_dirty(handle, inode, &iloc); +out_err: + ext4_std_error(inode->i_sb, err); + return err; + +out_brelse: + brelse(iloc.bh); + goto out_err; +} + +#ifdef CONFIG_QUOTA +static int ext4_quota_on_mount(struct super_block *sb, int type) +{ + return dquot_quota_on_mount(sb, + rcu_dereference_protected(EXT4_SB(sb)->s_qf_names[type], + lockdep_is_held(&sb->s_umount)), + EXT4_SB(sb)->s_jquota_fmt, type); +} +#endif + +static void ext4_process_orphan(struct inode *inode, + int *nr_truncates, int *nr_orphans) +{ + struct super_block *sb = inode->i_sb; + int ret; + + dquot_initialize(inode); + if (inode->i_nlink) { + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "%s: truncating inode %lu to %lld bytes", + __func__, inode->i_ino, inode->i_size); + ext4_debug("truncating inode %lu to %lld bytes\n", + inode->i_ino, inode->i_size); + inode_lock(inode); + truncate_inode_pages(inode->i_mapping, inode->i_size); + ret = ext4_truncate(inode); + if (ret) { + /* + * We need to clean up the in-core orphan list + * manually if ext4_truncate() failed to get a + * transaction handle. + */ + ext4_orphan_del(NULL, inode); + ext4_std_error(inode->i_sb, ret); + } + inode_unlock(inode); + (*nr_truncates)++; + } else { + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "%s: deleting unreferenced inode %lu", + __func__, inode->i_ino); + ext4_debug("deleting unreferenced inode %lu\n", + inode->i_ino); + (*nr_orphans)++; + } + iput(inode); /* The delete magic happens here! */ +} + +/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at + * the superblock) which were deleted from all directories, but held open by + * a process at the time of a crash. We walk the list and try to delete these + * inodes at recovery time (only with a read-write filesystem). + * + * In order to keep the orphan inode chain consistent during traversal (in + * case of crash during recovery), we link each inode into the superblock + * orphan list_head and handle it the same way as an inode deletion during + * normal operation (which journals the operations for us). + * + * We only do an iget() and an iput() on each inode, which is very safe if we + * accidentally point at an in-use or already deleted inode. The worst that + * can happen in this case is that we get a "bit already cleared" message from + * ext4_free_inode(). 
The only reason we would point at a wrong inode is if + * e2fsck was run on this filesystem, and it must have already done the orphan + * inode cleanup for us, so we can safely abort without any further action. + */ +void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) +{ + unsigned int s_flags = sb->s_flags; + int nr_orphans = 0, nr_truncates = 0; + struct inode *inode; + int i, j; +#ifdef CONFIG_QUOTA + int quota_update = 0; +#endif + __le32 *bdata; + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); + + if (!es->s_last_orphan && !oi->of_blocks) { + ext4_debug("no orphan inodes to clean up\n"); + return; + } + + if (bdev_read_only(sb->s_bdev)) { + ext4_msg(sb, KERN_ERR, "write access " + "unavailable, skipping orphan cleanup"); + return; + } + + /* Check if feature set would not allow a r/w mount */ + if (!ext4_feature_set_ok(sb, 0)) { + ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " + "unknown ROCOMPAT features"); + return; + } + + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + /* don't clear list on RO mount w/ errors */ + if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { + ext4_msg(sb, KERN_INFO, "Errors on filesystem, " + "clearing orphan list."); + es->s_last_orphan = 0; + } + ext4_debug("Skipping orphan recovery on fs with errors.\n"); + return; + } + + if (s_flags & SB_RDONLY) { + ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); + sb->s_flags &= ~SB_RDONLY; + } +#ifdef CONFIG_QUOTA + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. + */ + if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) { + int ret = ext4_enable_quotas(sb); + + if (!ret) + quota_update = 1; + else + ext4_msg(sb, KERN_ERR, + "Cannot turn on quotas: error %d", ret); + } + + /* Turn on journaled quotas used for old sytle */ + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + if (EXT4_SB(sb)->s_qf_names[i]) { + int ret = ext4_quota_on_mount(sb, i); + + if (!ret) + quota_update = 1; + else + ext4_msg(sb, KERN_ERR, + "Cannot turn on journaled " + "quota: type %d: error %d", i, ret); + } + } +#endif + + while (es->s_last_orphan) { + /* + * We may have encountered an error during cleanup; if + * so, skip the rest. + */ + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + ext4_debug("Skipping orphan recovery on fs with errors.\n"); + es->s_last_orphan = 0; + break; + } + + inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); + if (IS_ERR(inode)) { + es->s_last_orphan = 0; + break; + } + + list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); + ext4_process_orphan(inode, &nr_truncates, &nr_orphans); + } + + for (i = 0; i < oi->of_blocks; i++) { + bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); + for (j = 0; j < inodes_per_ob; j++) { + if (!bdata[j]) + continue; + inode = ext4_orphan_get(sb, le32_to_cpu(bdata[j])); + if (IS_ERR(inode)) + continue; + ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); + EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; + ext4_process_orphan(inode, &nr_truncates, &nr_orphans); + } + } + +#define PLURAL(x) (x), ((x) == 1) ? 
"" : "s" + + if (nr_orphans) + ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", + PLURAL(nr_orphans)); + if (nr_truncates) + ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", + PLURAL(nr_truncates)); +#ifdef CONFIG_QUOTA + /* Turn off quotas if they were enabled for orphan cleanup */ + if (quota_update) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) + dquot_quota_off(sb, i); + } + } +#endif + sb->s_flags = s_flags; /* Restore SB_RDONLY status */ +} + +void ext4_release_orphan_info(struct super_block *sb) +{ + int i; + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + + if (!oi->of_blocks) + return; + for (i = 0; i < oi->of_blocks; i++) + brelse(oi->of_binfo[i].ob_bh); + kvfree(oi->of_binfo); +} + +static struct ext4_orphan_block_tail *ext4_orphan_block_tail( + struct super_block *sb, + struct buffer_head *bh) +{ + return (struct ext4_orphan_block_tail *)(bh->b_data + sb->s_blocksize - + sizeof(struct ext4_orphan_block_tail)); +} + +static int ext4_orphan_file_block_csum_verify(struct super_block *sb, + struct buffer_head *bh) +{ + __u32 calculated; + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + struct ext4_orphan_block_tail *ot; + __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); + + if (!ext4_has_feature_metadata_csum(sb)) + return 1; + + ot = ext4_orphan_block_tail(sb, bh); + calculated = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); + calculated = ext4_chksum(calculated, (__u8 *)bh->b_data, + inodes_per_ob * sizeof(__u32)); + return le32_to_cpu(ot->ob_checksum) == calculated; +} + +/* This gets called only when checksumming is enabled */ +void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size) +{ + struct super_block *sb = EXT4_TRIGGER(triggers)->sb; + __u32 csum; + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + struct ext4_orphan_block_tail *ot; + __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); + + csum = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); + csum = ext4_chksum(csum, (__u8 *)data, inodes_per_ob * sizeof(__u32)); + ot = ext4_orphan_block_tail(sb, bh); + ot->ob_checksum = cpu_to_le32(csum); +} + +int ext4_init_orphan_info(struct super_block *sb) +{ + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + struct inode *inode; + int i, j; + int ret; + int free; + __le32 *bdata; + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); + struct ext4_orphan_block_tail *ot; + ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum); + + if (!ext4_has_feature_orphan_file(sb)) + return 0; + + inode = ext4_iget(sb, orphan_ino, EXT4_IGET_SPECIAL); + if (IS_ERR(inode)) { + ext4_msg(sb, KERN_ERR, "get orphan inode failed"); + return PTR_ERR(inode); + } + /* + * This is just an artificial limit to prevent corrupted fs from + * consuming absurd amounts of memory when pinning blocks of orphan + * file in memory. 
+ */ + if (inode->i_size > 8 << 20) { + ext4_msg(sb, KERN_ERR, "orphan file too big: %llu", + (unsigned long long)inode->i_size); + ret = -EFSCORRUPTED; + goto out_put; + } + oi->of_blocks = inode->i_size >> sb->s_blocksize_bits; + oi->of_csum_seed = EXT4_I(inode)->i_csum_seed; + oi->of_binfo = kvmalloc_array(oi->of_blocks, + sizeof(struct ext4_orphan_block), + GFP_KERNEL); + if (!oi->of_binfo) { + ret = -ENOMEM; + goto out_put; + } + for (i = 0; i < oi->of_blocks; i++) { + oi->of_binfo[i].ob_bh = ext4_bread(NULL, inode, i, 0); + if (IS_ERR(oi->of_binfo[i].ob_bh)) { + ret = PTR_ERR(oi->of_binfo[i].ob_bh); + goto out_free; + } + if (!oi->of_binfo[i].ob_bh) { + ret = -EIO; + goto out_free; + } + ot = ext4_orphan_block_tail(sb, oi->of_binfo[i].ob_bh); + if (le32_to_cpu(ot->ob_magic) != EXT4_ORPHAN_BLOCK_MAGIC) { + ext4_error(sb, "orphan file block %d: bad magic", i); + ret = -EIO; + goto out_free; + } + if (!ext4_orphan_file_block_csum_verify(sb, + oi->of_binfo[i].ob_bh)) { + ext4_error(sb, "orphan file block %d: bad checksum", i); + ret = -EIO; + goto out_free; + } + bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); + free = 0; + for (j = 0; j < inodes_per_ob; j++) + if (bdata[j] == 0) + free++; + atomic_set(&oi->of_binfo[i].ob_free_entries, free); + } + iput(inode); + return 0; +out_free: + for (i--; i >= 0; i--) + brelse(oi->of_binfo[i].ob_bh); + kvfree(oi->of_binfo); +out_put: + iput(inode); + return ret; +} + +int ext4_orphan_file_empty(struct super_block *sb) +{ + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + int i; + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); + + if (!ext4_has_feature_orphan_file(sb)) + return 1; + for (i = 0; i < oi->of_blocks; i++) + if (atomic_read(&oi->of_binfo[i].ob_free_entries) != + inodes_per_ob) + return 0; + return 1; +} diff --git a/fs/ext4l/page-io.c b/fs/ext4l/page-io.c new file mode 100644 index 00000000000..39abfeec5f3 --- /dev/null +++ b/fs/ext4l/page-io.c @@ -0,0 +1,593 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/page-io.c + * + * This contains the new page_io functions for ext4 + * + * Written by Theodore Ts'o, 2010. 
+ */ + +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/highuid.h> +#include <linux/pagemap.h> +#include <linux/quotaops.h> +#include <linux/string.h> +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include <linux/mpage.h> +#include <linux/namei.h> +#include <linux/uio.h> +#include <linux/bio.h> +#include <linux/workqueue.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/sched/mm.h> + +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" + +static struct kmem_cache *io_end_cachep; +static struct kmem_cache *io_end_vec_cachep; + +int __init ext4_init_pageio(void) +{ + io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); + if (io_end_cachep == NULL) + return -ENOMEM; + + io_end_vec_cachep = KMEM_CACHE(ext4_io_end_vec, 0); + if (io_end_vec_cachep == NULL) { + kmem_cache_destroy(io_end_cachep); + return -ENOMEM; + } + return 0; +} + +void ext4_exit_pageio(void) +{ + kmem_cache_destroy(io_end_cachep); + kmem_cache_destroy(io_end_vec_cachep); +} + +struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end) +{ + struct ext4_io_end_vec *io_end_vec; + + io_end_vec = kmem_cache_zalloc(io_end_vec_cachep, GFP_NOFS); + if (!io_end_vec) + return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&io_end_vec->list); + list_add_tail(&io_end_vec->list, &io_end->list_vec); + return io_end_vec; +} + +static void ext4_free_io_end_vec(ext4_io_end_t *io_end) +{ + struct ext4_io_end_vec *io_end_vec, *tmp; + + if (list_empty(&io_end->list_vec)) + return; + list_for_each_entry_safe(io_end_vec, tmp, &io_end->list_vec, list) { + list_del(&io_end_vec->list); + kmem_cache_free(io_end_vec_cachep, io_end_vec); + } +} + +struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end) +{ + BUG_ON(list_empty(&io_end->list_vec)); + return list_last_entry(&io_end->list_vec, struct ext4_io_end_vec, list); +} + +/* + * Print a buffer I/O error in a format compatible with fs/buffer.c. This + * provides compatibility with dmesg scrapers that look for a specific + * buffer I/O error message. We really need a unified error reporting + * structure to userspace ala Digital Unix's uerf system, but it's + * probably not going to happen in my lifetime, due to LKML politics...
+ */ +static void buffer_io_error(struct buffer_head *bh) +{ + printk_ratelimited(KERN_ERR "Buffer I/O error on device %pg, logical block %llu\n", + bh->b_bdev, + (unsigned long long)bh->b_blocknr); +} + +static void ext4_finish_bio(struct bio *bio) +{ + struct folio_iter fi; + + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; + struct folio *io_folio = NULL; + struct buffer_head *bh, *head; + size_t bio_start = fi.offset; + size_t bio_end = bio_start + fi.length; + unsigned under_io = 0; + unsigned long flags; + + if (fscrypt_is_bounce_folio(folio)) { + io_folio = folio; + folio = fscrypt_pagecache_folio(folio); + } + + if (bio->bi_status) { + int err = blk_status_to_errno(bio->bi_status); + mapping_set_error(folio->mapping, err); + } + bh = head = folio_buffers(folio); + /* + * We check all buffers in the folio under b_uptodate_lock + * to avoid races with other end io clearing async_write flags + */ + spin_lock_irqsave(&head->b_uptodate_lock, flags); + do { + if (bh_offset(bh) < bio_start || + bh_offset(bh) + bh->b_size > bio_end) { + if (buffer_async_write(bh)) + under_io++; + continue; + } + clear_buffer_async_write(bh); + if (bio->bi_status) { + set_buffer_write_io_error(bh); + buffer_io_error(bh); + } + } while ((bh = bh->b_this_page) != head); + spin_unlock_irqrestore(&head->b_uptodate_lock, flags); + if (!under_io) { + fscrypt_free_bounce_page(&io_folio->page); + folio_end_writeback(folio); + } + } +} + +static void ext4_release_io_end(ext4_io_end_t *io_end) +{ + struct bio *bio, *next_bio; + + BUG_ON(!list_empty(&io_end->list)); + BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); + WARN_ON(io_end->handle); + + for (bio = io_end->bio; bio; bio = next_bio) { + next_bio = bio->bi_private; + ext4_finish_bio(bio); + bio_put(bio); + } + ext4_free_io_end_vec(io_end); + kmem_cache_free(io_end_cachep, io_end); +} + +/* + * On successful IO, check a range of space and convert unwritten extents to + * written. On IO failure, check if journal abort is needed. Note that + * we are protected from truncate touching same part of extent tree by the + * fact that truncate code waits for all DIO to finish (thus exclusion from + * direct IO is achieved) and also waits for PageWriteback bits. Thus we + * cannot get to ext4_ext_truncate() before all IOs overlapping that range are + * completed (happens from ext4_free_ioend()). + */ +static int ext4_end_io_end(ext4_io_end_t *io_end) +{ + struct inode *inode = io_end->inode; + handle_t *handle = io_end->handle; + struct super_block *sb = inode->i_sb; + int ret = 0; + + ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p," + "list->prev 0x%p\n", + io_end, inode->i_ino, io_end->list.next, io_end->list.prev); + + /* + * Do not convert the unwritten extents if data writeback fails, + * or stale data may be exposed. + */ + io_end->handle = NULL; /* Following call will use up the handle */ + if (unlikely(io_end->flag & EXT4_IO_END_FAILED)) { + ret = -EIO; + if (handle) + jbd2_journal_free_reserved(handle); + + if (test_opt(sb, DATA_ERR_ABORT)) + jbd2_journal_abort(EXT4_SB(sb)->s_journal, ret); + } else { + ret = ext4_convert_unwritten_io_end_vec(handle, io_end); + } + if (ret < 0 && !ext4_emergency_state(sb) && + io_end->flag & EXT4_IO_END_UNWRITTEN) { + ext4_msg(sb, KERN_EMERG, + "failed to convert unwritten extents to written " + "extents -- potential data loss! 
" + "(inode %lu, error %d)", inode->i_ino, ret); + } + + ext4_clear_io_unwritten_flag(io_end); + ext4_release_io_end(io_end); + return ret; +} + +static void dump_completed_IO(struct inode *inode, struct list_head *head) +{ +#ifdef EXT4FS_DEBUG + struct list_head *cur, *before, *after; + ext4_io_end_t *io_end, *io_end0, *io_end1; + + if (list_empty(head)) + return; + + ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); + list_for_each_entry(io_end, head, list) { + cur = &io_end->list; + before = cur->prev; + io_end0 = container_of(before, ext4_io_end_t, list); + after = cur->next; + io_end1 = container_of(after, ext4_io_end_t, list); + + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", + io_end, inode->i_ino, io_end0, io_end1); + } +#endif +} + +static bool ext4_io_end_defer_completion(ext4_io_end_t *io_end) +{ + if (io_end->flag & EXT4_IO_END_UNWRITTEN && + !list_empty(&io_end->list_vec)) + return true; + if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) && + io_end->flag & EXT4_IO_END_FAILED && + !ext4_emergency_state(io_end->inode->i_sb)) + return true; + return false; +} + +/* Add the io_end to per-inode completed end_io list. */ +static void ext4_add_complete_io(ext4_io_end_t *io_end) +{ + struct ext4_inode_info *ei = EXT4_I(io_end->inode); + struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb); + struct workqueue_struct *wq; + unsigned long flags; + + /* Only reserved conversions or pending IO errors will enter here. */ + WARN_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION)); + WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN && + !io_end->handle && sbi->s_journal); + WARN_ON(!io_end->bio); + + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + wq = sbi->rsv_conversion_wq; + if (list_empty(&ei->i_rsv_conversion_list)) + queue_work(wq, &ei->i_rsv_conversion_work); + list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); +} + +static int ext4_do_flush_completed_IO(struct inode *inode, + struct list_head *head) +{ + ext4_io_end_t *io_end; + struct list_head unwritten; + unsigned long flags; + struct ext4_inode_info *ei = EXT4_I(inode); + int err, ret = 0; + + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + dump_completed_IO(inode, head); + list_replace_init(head, &unwritten); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + + while (!list_empty(&unwritten)) { + io_end = list_entry(unwritten.next, ext4_io_end_t, list); + BUG_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION)); + list_del_init(&io_end->list); + + err = ext4_end_io_end(io_end); + if (unlikely(!ret && err)) + ret = err; + } + return ret; +} + +/* + * Used to convert unwritten extents to written extents upon IO completion, + * or used to abort the journal upon IO errors. 
+ */ +void ext4_end_io_rsv_work(struct work_struct *work) +{ + struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, + i_rsv_conversion_work); + ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list); +} + +ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) +{ + ext4_io_end_t *io_end = kmem_cache_zalloc(io_end_cachep, flags); + + if (io_end) { + io_end->inode = inode; + INIT_LIST_HEAD(&io_end->list); + INIT_LIST_HEAD(&io_end->list_vec); + refcount_set(&io_end->count, 1); + } + return io_end; +} + +void ext4_put_io_end_defer(ext4_io_end_t *io_end) +{ + if (refcount_dec_and_test(&io_end->count)) { + if (ext4_io_end_defer_completion(io_end)) + return ext4_add_complete_io(io_end); + + ext4_release_io_end(io_end); + } +} + +int ext4_put_io_end(ext4_io_end_t *io_end) +{ + if (refcount_dec_and_test(&io_end->count)) { + if (ext4_io_end_defer_completion(io_end)) + return ext4_end_io_end(io_end); + + ext4_release_io_end(io_end); + } + return 0; +} + +ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) +{ + refcount_inc(&io_end->count); + return io_end; +} + +/* BIO completion function for page writeback */ +static void ext4_end_bio(struct bio *bio) +{ + ext4_io_end_t *io_end = bio->bi_private; + sector_t bi_sector = bio->bi_iter.bi_sector; + + if (WARN_ONCE(!io_end, "io_end is NULL: %pg: sector %Lu len %u err %d\n", + bio->bi_bdev, + (long long) bio->bi_iter.bi_sector, + (unsigned) bio_sectors(bio), + bio->bi_status)) { + ext4_finish_bio(bio); + bio_put(bio); + return; + } + bio->bi_end_io = NULL; + + if (bio->bi_status) { + struct inode *inode = io_end->inode; + + ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " + "starting block %llu)", + bio->bi_status, inode->i_ino, + (unsigned long long) + bi_sector >> (inode->i_blkbits - 9)); + io_end->flag |= EXT4_IO_END_FAILED; + mapping_set_error(inode->i_mapping, + blk_status_to_errno(bio->bi_status)); + } + + if (ext4_io_end_defer_completion(io_end)) { + /* + * Link bio into list hanging from io_end. We have to do it + * atomically as bio completions can be racing against each + * other. + */ + bio->bi_private = xchg(&io_end->bio, bio); + ext4_put_io_end_defer(io_end); + } else { + /* + * Drop io_end reference early. Inode can get freed once + * we finish the bio. + */ + ext4_put_io_end_defer(io_end); + ext4_finish_bio(bio); + bio_put(bio); + } +} + +void ext4_io_submit(struct ext4_io_submit *io) +{ + struct bio *bio = io->io_bio; + + if (bio) { + if (io->io_wbc->sync_mode == WB_SYNC_ALL) + io->io_bio->bi_opf |= REQ_SYNC; + submit_bio(io->io_bio); + } + io->io_bio = NULL; +} + +void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc) +{ + io->io_wbc = wbc; + io->io_bio = NULL; + io->io_end = NULL; +} + +static void io_submit_init_bio(struct ext4_io_submit *io, + struct buffer_head *bh) +{ + struct bio *bio; + + /* + * bio_alloc will _always_ be able to allocate a bio if + * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset(). 
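+ * GFP_NOIO, used below, includes __GFP_DIRECT_RECLAIM, so this allocation + * cannot fail.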
+ */ + bio = bio_alloc(bh->b_bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOIO); + fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_end_io = ext4_end_bio; + bio->bi_private = ext4_get_io_end(io->io_end); + io->io_bio = bio; + io->io_next_block = bh->b_blocknr; + wbc_init_bio(io->io_wbc, bio); +} + +static void io_submit_add_bh(struct ext4_io_submit *io, + struct inode *inode, + struct folio *folio, + struct folio *io_folio, + struct buffer_head *bh) +{ + if (io->io_bio && (bh->b_blocknr != io->io_next_block || + !fscrypt_mergeable_bio_bh(io->io_bio, bh))) { +submit_and_retry: + ext4_io_submit(io); + } + if (io->io_bio == NULL) { + io_submit_init_bio(io, bh); + io->io_bio->bi_write_hint = inode->i_write_hint; + } + if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh))) + goto submit_and_retry; + wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size); + io->io_next_block++; +} + +int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, + size_t len) +{ + struct folio *io_folio = folio; + struct inode *inode = folio->mapping->host; + unsigned block_start; + struct buffer_head *bh, *head; + int ret = 0; + int nr_to_submit = 0; + struct writeback_control *wbc = io->io_wbc; + bool keep_towrite = false; + + BUG_ON(!folio_test_locked(folio)); + BUG_ON(folio_test_writeback(folio)); + + /* + * Comments copied from block_write_full_folio: + * + * The folio straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + if (len < folio_size(folio)) + folio_zero_segment(folio, len, folio_size(folio)); + /* + * In the first loop we prepare and mark buffers to submit. We have to + * mark all buffers in the folio before submitting so that + * folio_end_writeback() cannot be called from ext4_end_bio() when IO + * on the first buffer finishes and we are still working on submitting + * the second buffer. + */ + bh = head = folio_buffers(folio); + do { + block_start = bh_offset(bh); + if (block_start >= len) { + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + continue; + } + if (!buffer_dirty(bh) || buffer_delay(bh) || + !buffer_mapped(bh) || buffer_unwritten(bh)) { + /* A hole? We can safely clear the dirty bit */ + if (!buffer_mapped(bh)) + clear_buffer_dirty(bh); + /* + * Keeping dirty some buffer we cannot write? Make sure + * to redirty the folio and keep TOWRITE tag so that + * racing WB_SYNC_ALL writeback does not skip the folio. + * This happens e.g. when doing writeout for + * transaction commit or when journalled data is not + * yet committed. + */ + if (buffer_dirty(bh) || + (buffer_jbd(bh) && buffer_jbddirty(bh))) { + if (!folio_test_dirty(folio)) + folio_redirty_for_writepage(wbc, folio); + keep_towrite = true; + } + continue; + } + if (buffer_new(bh)) + clear_buffer_new(bh); + set_buffer_async_write(bh); + clear_buffer_dirty(bh); + nr_to_submit++; + } while ((bh = bh->b_this_page) != head); + + /* Nothing to submit? Just unlock the folio... */ + if (!nr_to_submit) + return 0; + + bh = head = folio_buffers(folio); + + /* + * If any blocks are being written to an encrypted file, encrypt them + * into a bounce page. For simplicity, just encrypt until the last + * block which might be needed. 
This may cause some unneeded blocks + * (e.g. holes) to be unnecessarily encrypted, but this is rare and + * can't happen in the common case of blocksize == PAGE_SIZE. + */ + if (fscrypt_inode_uses_fs_layer_crypto(inode)) { + gfp_t gfp_flags = GFP_NOFS; + unsigned int enc_bytes = round_up(len, i_blocksize(inode)); + struct page *bounce_page; + + /* + * Since bounce page allocation uses a mempool, we can only use + * a waiting mask (i.e. request guaranteed allocation) on the + * first page of the bio. Otherwise it can deadlock. + */ + if (io->io_bio) + gfp_flags = GFP_NOWAIT; + retry_encrypt: + bounce_page = fscrypt_encrypt_pagecache_blocks(folio, + enc_bytes, 0, gfp_flags); + if (IS_ERR(bounce_page)) { + ret = PTR_ERR(bounce_page); + if (ret == -ENOMEM && + (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) { + gfp_t new_gfp_flags = GFP_NOFS; + if (io->io_bio) + ext4_io_submit(io); + else + new_gfp_flags |= __GFP_NOFAIL; + memalloc_retry_wait(gfp_flags); + gfp_flags = new_gfp_flags; + goto retry_encrypt; + } + + printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); + folio_redirty_for_writepage(wbc, folio); + do { + if (buffer_async_write(bh)) { + clear_buffer_async_write(bh); + set_buffer_dirty(bh); + } + bh = bh->b_this_page; + } while (bh != head); + + return ret; + } + io_folio = page_folio(bounce_page); + } + + __folio_start_writeback(folio, keep_towrite); + + /* Now submit buffers to write */ + do { + if (!buffer_async_write(bh)) + continue; + io_submit_add_bh(io, inode, folio, io_folio, bh); + } while ((bh = bh->b_this_page) != head); + + return 0; +} -- 2.43.0
From: Simon Glass <simon.glass@canonical.com> Copy resize.c, symlink.c, sysfs.c, truncate.h, verity.c, and readpage.c from Linux v6.18 fs/ext4 directory. - resize: online filesystem resize support - symlink: symbolic link operations - sysfs: sysfs attribute interface - truncate.h: truncate-related declarations - verity: fs-verity support - readpage: page reading operations Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> --- fs/ext4l/readpage.c | 422 +++++++++ fs/ext4l/resize.c | 2193 +++++++++++++++++++++++++++++++++++++++++++ fs/ext4l/symlink.c | 136 +++ fs/ext4l/sysfs.c | 648 +++++++++++++ fs/ext4l/truncate.h | 52 + fs/ext4l/verity.c | 399 ++++++++ 6 files changed, 3850 insertions(+) create mode 100644 fs/ext4l/readpage.c create mode 100644 fs/ext4l/resize.c create mode 100644 fs/ext4l/symlink.c create mode 100644 fs/ext4l/sysfs.c create mode 100644 fs/ext4l/truncate.h create mode 100644 fs/ext4l/verity.c diff --git a/fs/ext4l/readpage.c b/fs/ext4l/readpage.c new file mode 100644 index 00000000000..f329daf6e5c --- /dev/null +++ b/fs/ext4l/readpage.c @@ -0,0 +1,422 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/readpage.c + * + * Copyright (C) 2002, Linus Torvalds. + * Copyright (C) 2015, Google, Inc. + * + * This was originally taken from fs/mpage.c + * + * The ext4_mpage_readpages() function here is intended to + * replace mpage_readahead() in the general case, not just for + * encrypted files. It has some limitations (see below), where it + * will fall back to read_block_full_page(), but these limitations + * should only be hit when page_size != block_size. + * + * This will allow us to attach a callback function to support ext4 + * encryption. + * + * If anything unusual happens, such as: + * + * - encountering a page which has buffers + * - encountering a page which has a non-hole after a hole + * - encountering a page with non-contiguous blocks + * + * then this code just gives up and calls the buffer_head-based read function. + * It does handle a page which has holes at the end - that is a common case: + * the end-of-file on blocksize < PAGE_SIZE setups. 
+ * + */ + +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/mm.h> +#include <linux/kdev_t.h> +#include <linux/gfp.h> +#include <linux/bio.h> +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/highmem.h> +#include <linux/prefetch.h> +#include <linux/mpage.h> +#include <linux/writeback.h> +#include <linux/backing-dev.h> +#include <linux/pagevec.h> + +#include "ext4.h" + +#define NUM_PREALLOC_POST_READ_CTXS 128 + +static struct kmem_cache *bio_post_read_ctx_cache; +static mempool_t *bio_post_read_ctx_pool; + +/* postprocessing steps for read bios */ +enum bio_post_read_step { + STEP_INITIAL = 0, + STEP_DECRYPT, + STEP_VERITY, + STEP_MAX, +}; + +struct bio_post_read_ctx { + struct bio *bio; + struct work_struct work; + unsigned int cur_step; + unsigned int enabled_steps; +}; + +static void __read_end_io(struct bio *bio) +{ + struct folio_iter fi; + + bio_for_each_folio_all(fi, bio) + folio_end_read(fi.folio, bio->bi_status == 0); + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); + bio_put(bio); +} + +static void bio_post_read_processing(struct bio_post_read_ctx *ctx); + +static void decrypt_work(struct work_struct *work) +{ + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; + + if (fscrypt_decrypt_bio(bio)) + bio_post_read_processing(ctx); + else + __read_end_io(bio); +} + +static void verity_work(struct work_struct *work) +{ + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; + + /* + * fsverity_verify_bio() may call readahead() again, and although verity + * will be disabled for that, decryption may still be needed, causing + * another bio_post_read_ctx to be allocated. So to guarantee that + * mempool_alloc() never deadlocks we must free the current ctx first. + * This is safe because verity is the last post-read step. + */ + BUILD_BUG_ON(STEP_VERITY + 1 != STEP_MAX); + mempool_free(ctx, bio_post_read_ctx_pool); + bio->bi_private = NULL; + + fsverity_verify_bio(bio); + + __read_end_io(bio); +} + +static void bio_post_read_processing(struct bio_post_read_ctx *ctx) +{ + /* + * We use different work queues for decryption and for verity because + * verity may require reading metadata pages that need decryption, and + * we shouldn't recurse to the same workqueue. + */ + switch (++ctx->cur_step) { + case STEP_DECRYPT: + if (ctx->enabled_steps & (1 << STEP_DECRYPT)) { + INIT_WORK(&ctx->work, decrypt_work); + fscrypt_enqueue_decrypt_work(&ctx->work); + return; + } + ctx->cur_step++; + fallthrough; + case STEP_VERITY: + if (ctx->enabled_steps & (1 << STEP_VERITY)) { + INIT_WORK(&ctx->work, verity_work); + fsverity_enqueue_verify_work(&ctx->work); + return; + } + ctx->cur_step++; + fallthrough; + default: + __read_end_io(ctx->bio); + } +} + +static bool bio_post_read_required(struct bio *bio) +{ + return bio->bi_private && !bio->bi_status; +} + +/* + * I/O completion handler for multipage BIOs. + * + * The mpage code never puts partial pages into a BIO (except for end-of-file). + * If a page does not map to a contiguous run of blocks then it simply falls + * back to block_read_full_folio(). + * + * Why is this? If a page's completion depends on a number of different BIOs + * which can complete in any order (or at the same time) then determining the + * status of that page is hard. See end_buffer_async_read() for the details. 
+ * There is no point in duplicating all that complexity. + */ +static void mpage_end_io(struct bio *bio) +{ + if (bio_post_read_required(bio)) { + struct bio_post_read_ctx *ctx = bio->bi_private; + + ctx->cur_step = STEP_INITIAL; + bio_post_read_processing(ctx); + return; + } + __read_end_io(bio); +} + +static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx) +{ + return fsverity_active(inode) && + idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); +} + +static void ext4_set_bio_post_read_ctx(struct bio *bio, + const struct inode *inode, + pgoff_t first_idx) +{ + unsigned int post_read_steps = 0; + + if (fscrypt_inode_uses_fs_layer_crypto(inode)) + post_read_steps |= 1 << STEP_DECRYPT; + + if (ext4_need_verity(inode, first_idx)) + post_read_steps |= 1 << STEP_VERITY; + + if (post_read_steps) { + /* Due to the mempool, this never fails. */ + struct bio_post_read_ctx *ctx = + mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); + + ctx->bio = bio; + ctx->enabled_steps = post_read_steps; + bio->bi_private = ctx; + } +} + +static inline loff_t ext4_readpage_limit(struct inode *inode) +{ + if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) + return inode->i_sb->s_maxbytes; + + return i_size_read(inode); +} + +int ext4_mpage_readpages(struct inode *inode, + struct readahead_control *rac, struct folio *folio) +{ + struct bio *bio = NULL; + sector_t last_block_in_bio = 0; + + const unsigned blkbits = inode->i_blkbits; + const unsigned blocks_per_page = PAGE_SIZE >> blkbits; + const unsigned blocksize = 1 << blkbits; + sector_t next_block; + sector_t block_in_file; + sector_t last_block; + sector_t last_block_in_file; + sector_t first_block; + unsigned page_block; + struct block_device *bdev = inode->i_sb->s_bdev; + int length; + unsigned relative_block = 0; + struct ext4_map_blocks map; + unsigned int nr_pages, folio_pages; + + map.m_pblk = 0; + map.m_lblk = 0; + map.m_len = 0; + map.m_flags = 0; + + nr_pages = rac ? readahead_count(rac) : folio_nr_pages(folio); + for (; nr_pages; nr_pages -= folio_pages) { + int fully_mapped = 1; + unsigned int first_hole; + unsigned int blocks_per_folio; + + if (rac) + folio = readahead_folio(rac); + + folio_pages = folio_nr_pages(folio); + prefetchw(&folio->flags); + + if (folio_buffers(folio)) + goto confused; + + blocks_per_folio = folio_size(folio) >> blkbits; + first_hole = blocks_per_folio; + block_in_file = next_block = + (sector_t)folio->index << (PAGE_SHIFT - blkbits); + last_block = block_in_file + nr_pages * blocks_per_page; + last_block_in_file = (ext4_readpage_limit(inode) + + blocksize - 1) >> blkbits; + if (last_block > last_block_in_file) + last_block = last_block_in_file; + page_block = 0; + + /* + * Map blocks using the previous result first. + */ + if ((map.m_flags & EXT4_MAP_MAPPED) && + block_in_file > map.m_lblk && + block_in_file < (map.m_lblk + map.m_len)) { + unsigned map_offset = block_in_file - map.m_lblk; + unsigned last = map.m_len - map_offset; + + first_block = map.m_pblk + map_offset; + for (relative_block = 0; ; relative_block++) { + if (relative_block == last) { + /* needed? */ + map.m_flags &= ~EXT4_MAP_MAPPED; + break; + } + if (page_block == blocks_per_folio) + break; + page_block++; + block_in_file++; + } + } + + /* + * Then do more ext4_map_blocks() calls until we are + * done with this folio. 
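+ * Note that ext4_map_blocks() is called with a NULL handle and no flags, + * so it only looks up existing mappings and never allocates new blocks.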
+ */ + while (page_block < blocks_per_folio) { + if (block_in_file < last_block) { + map.m_lblk = block_in_file; + map.m_len = last_block - block_in_file; + + if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { + set_error_page: + folio_zero_segment(folio, 0, + folio_size(folio)); + folio_unlock(folio); + goto next_page; + } + } + if ((map.m_flags & EXT4_MAP_MAPPED) == 0) { + fully_mapped = 0; + if (first_hole == blocks_per_folio) + first_hole = page_block; + page_block++; + block_in_file++; + continue; + } + if (first_hole != blocks_per_folio) + goto confused; /* hole -> non-hole */ + + /* Contiguous blocks? */ + if (!page_block) + first_block = map.m_pblk; + else if (first_block + page_block != map.m_pblk) + goto confused; + for (relative_block = 0; ; relative_block++) { + if (relative_block == map.m_len) { + /* needed? */ + map.m_flags &= ~EXT4_MAP_MAPPED; + break; + } else if (page_block == blocks_per_folio) + break; + page_block++; + block_in_file++; + } + } + if (first_hole != blocks_per_folio) { + folio_zero_segment(folio, first_hole << blkbits, + folio_size(folio)); + if (first_hole == 0) { + if (ext4_need_verity(inode, folio->index) && + !fsverity_verify_folio(folio)) + goto set_error_page; + folio_end_read(folio, true); + continue; + } + } else if (fully_mapped) { + folio_set_mappedtodisk(folio); + } + + /* + * This folio will go to BIO. Do we need to send this + * BIO off first? + */ + if (bio && (last_block_in_bio != first_block - 1 || + !fscrypt_mergeable_bio(bio, inode, next_block))) { + submit_and_realloc: + submit_bio(bio); + bio = NULL; + } + if (bio == NULL) { + /* + * bio_alloc will _always_ be able to allocate a bio if + * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). + */ + bio = bio_alloc(bdev, bio_max_segs(nr_pages), + REQ_OP_READ, GFP_KERNEL); + fscrypt_set_bio_crypt_ctx(bio, inode, next_block, + GFP_KERNEL); + ext4_set_bio_post_read_ctx(bio, inode, folio->index); + bio->bi_iter.bi_sector = first_block << (blkbits - 9); + bio->bi_end_io = mpage_end_io; + if (rac) + bio->bi_opf |= REQ_RAHEAD; + } + + length = first_hole << blkbits; + if (!bio_add_folio(bio, folio, length, 0)) + goto submit_and_realloc; + + if (((map.m_flags & EXT4_MAP_BOUNDARY) && + (relative_block == map.m_len)) || + (first_hole != blocks_per_folio)) { + submit_bio(bio); + bio = NULL; + } else + last_block_in_bio = first_block + blocks_per_folio - 1; + continue; + confused: + if (bio) { + submit_bio(bio); + bio = NULL; + } + if (!folio_test_uptodate(folio)) + block_read_full_folio(folio, ext4_get_block); + else + folio_unlock(folio); +next_page: + ; /* A label shall be followed by a statement until C23 */ + } + if (bio) + submit_bio(bio); + return 0; +} + +int __init ext4_init_post_read_processing(void) +{ + bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, SLAB_RECLAIM_ACCOUNT); + + if (!bio_post_read_ctx_cache) + goto fail; + bio_post_read_ctx_pool = + mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS, + bio_post_read_ctx_cache); + if (!bio_post_read_ctx_pool) + goto fail_free_cache; + return 0; + +fail_free_cache: + kmem_cache_destroy(bio_post_read_ctx_cache); +fail: + return -ENOMEM; +} + +void ext4_exit_post_read_processing(void) +{ + mempool_destroy(bio_post_read_ctx_pool); + kmem_cache_destroy(bio_post_read_ctx_cache); +} diff --git a/fs/ext4l/resize.c b/fs/ext4l/resize.c new file mode 100644 index 00000000000..050f26168d9 --- /dev/null +++ b/fs/ext4l/resize.c @@ -0,0 +1,2193 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/resize.c + * + * Support for resizing an 
ext4 filesystem while it is mounted. + * + * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com> + * + * This could probably be made into a module, because it is not often in use. + */ + + +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/jiffies.h> + +#include "ext4_jbd2.h" + +struct ext4_rcu_ptr { + struct rcu_head rcu; + void *ptr; +}; + +static void ext4_rcu_ptr_callback(struct rcu_head *head) +{ + struct ext4_rcu_ptr *ptr; + + ptr = container_of(head, struct ext4_rcu_ptr, rcu); + kvfree(ptr->ptr); + kfree(ptr); +} + +void ext4_kvfree_array_rcu(void *to_free) +{ + struct ext4_rcu_ptr *ptr = kzalloc(sizeof(*ptr), GFP_KERNEL); + + if (ptr) { + ptr->ptr = to_free; + call_rcu(&ptr->rcu, ext4_rcu_ptr_callback); + return; + } + synchronize_rcu(); + kvfree(to_free); +} + +int ext4_resize_begin(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int ret = 0; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + /* + * If the reserved GDT blocks is non-zero, the resize_inode feature + * should always be set. + */ + if (sbi->s_es->s_reserved_gdt_blocks && + !ext4_has_feature_resize_inode(sb)) { + ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero"); + return -EFSCORRUPTED; + } + + /* + * If we are not using the primary superblock/GDT copy don't resize, + * because the user tools have no way of handling this. Probably a + * bad time to do it anyways. + */ + if (EXT4_B2C(sbi, sbi->s_sbh->b_blocknr) != + le32_to_cpu(sbi->s_es->s_first_data_block)) { + ext4_warning(sb, "won't resize using backup superblock at %llu", + (unsigned long long)sbi->s_sbh->b_blocknr); + return -EPERM; + } + + /* + * We are not allowed to do online-resizing on a filesystem mounted + * with error, because it can destroy the filesystem easily. 
+ */ + if (sbi->s_mount_state & EXT4_ERROR_FS) { + ext4_warning(sb, "There are errors in the filesystem, " + "so online resizing is not allowed"); + return -EPERM; + } + + if (ext4_has_feature_sparse_super2(sb)) { + ext4_msg(sb, KERN_ERR, "Online resizing not supported with sparse_super2"); + return -EOPNOTSUPP; + } + + if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, + &sbi->s_ext4_flags)) + ret = -EBUSY; + + return ret; +} + +int ext4_resize_end(struct super_block *sb, bool update_backups) +{ + clear_bit_unlock(EXT4_FLAGS_RESIZING, &EXT4_SB(sb)->s_ext4_flags); + smp_mb__after_atomic(); + if (update_backups) + return ext4_update_overhead(sb, true); + return 0; +} + +static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb, + ext4_group_t group) { + ext4_grpblk_t overhead; + overhead = ext4_bg_num_gdb(sb, group); + if (ext4_bg_has_super(sb, group)) + overhead += 1 + + le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); + return overhead; +} + +#define outside(b, first, last) ((b) < (first) || (b) >= (last)) +#define inside(b, first, last) ((b) >= (first) && (b) < (last)) + +static int verify_group_input(struct super_block *sb, + struct ext4_new_group_data *input) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_fsblk_t start = ext4_blocks_count(es); + ext4_fsblk_t end = start + input->blocks_count; + ext4_group_t group = input->group; + ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; + unsigned overhead; + ext4_fsblk_t metaend; + struct buffer_head *bh = NULL; + ext4_grpblk_t free_blocks_count, offset; + int err = -EINVAL; + + if (group != sbi->s_groups_count) { + ext4_warning(sb, "Cannot add at group %u (only %u groups)", + input->group, sbi->s_groups_count); + return -EINVAL; + } + + overhead = ext4_group_overhead_blocks(sb, group); + metaend = start + overhead; + free_blocks_count = input->blocks_count - 2 - overhead - + sbi->s_itb_per_group; + input->free_clusters_count = EXT4_B2C(sbi, free_blocks_count); + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks " + "(%d free, %u reserved)\n", + ext4_bg_has_super(sb, input->group) ? 
"normal" : + "no-super", input->group, input->blocks_count, + free_blocks_count, input->reserved_blocks); + + ext4_get_group_no_and_offset(sb, start, NULL, &offset); + if (offset != 0) + ext4_warning(sb, "Last group not full"); + else if (input->reserved_blocks > input->blocks_count / 5) + ext4_warning(sb, "Reserved blocks too high (%u)", + input->reserved_blocks); + else if (free_blocks_count < 0) + ext4_warning(sb, "Bad blocks count %u", + input->blocks_count); + else if (IS_ERR(bh = ext4_sb_bread(sb, end - 1, 0))) { + err = PTR_ERR(bh); + bh = NULL; + ext4_warning(sb, "Cannot read last block (%llu)", + end - 1); + } else if (outside(input->block_bitmap, start, end)) + ext4_warning(sb, "Block bitmap not in group (block %llu)", + (unsigned long long)input->block_bitmap); + else if (outside(input->inode_bitmap, start, end)) + ext4_warning(sb, "Inode bitmap not in group (block %llu)", + (unsigned long long)input->inode_bitmap); + else if (outside(input->inode_table, start, end) || + outside(itend - 1, start, end)) + ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)", + (unsigned long long)input->inode_table, itend - 1); + else if (input->inode_bitmap == input->block_bitmap) + ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)", + (unsigned long long)input->block_bitmap); + else if (inside(input->block_bitmap, input->inode_table, itend)) + ext4_warning(sb, "Block bitmap (%llu) in inode table " + "(%llu-%llu)", + (unsigned long long)input->block_bitmap, + (unsigned long long)input->inode_table, itend - 1); + else if (inside(input->inode_bitmap, input->inode_table, itend)) + ext4_warning(sb, "Inode bitmap (%llu) in inode table " + "(%llu-%llu)", + (unsigned long long)input->inode_bitmap, + (unsigned long long)input->inode_table, itend - 1); + else if (inside(input->block_bitmap, start, metaend)) + ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)", + (unsigned long long)input->block_bitmap, + start, metaend - 1); + else if (inside(input->inode_bitmap, start, metaend)) + ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)", + (unsigned long long)input->inode_bitmap, + start, metaend - 1); + else if (inside(input->inode_table, start, metaend) || + inside(itend - 1, start, metaend)) + ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table " + "(%llu-%llu)", + (unsigned long long)input->inode_table, + itend - 1, start, metaend - 1); + else + err = 0; + brelse(bh); + + return err; +} + +/* + * ext4_new_flex_group_data is used by 64bit-resize interface to add a flex + * group each time. + */ +struct ext4_new_flex_group_data { + struct ext4_new_group_data *groups; /* new_group_data for groups + in the flex group */ + __u16 *bg_flags; /* block group flags of groups + in @groups */ + ext4_group_t resize_bg; /* number of allocated + new_group_data */ + ext4_group_t count; /* number of groups in @groups + */ +}; + +/* + * Avoiding memory allocation failures due to too many groups added each time. + */ +#define MAX_RESIZE_BG 16384 + +/* + * alloc_flex_gd() allocates an ext4_new_flex_group_data that satisfies the + * resizing from @o_group to @n_group, its size is typically @flexbg_size. + * + * Returns NULL on failure otherwise address of the allocated structure. 
+ */ +static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned int flexbg_size, + ext4_group_t o_group, ext4_group_t n_group) +{ + ext4_group_t last_group; + unsigned int max_resize_bg; + struct ext4_new_flex_group_data *flex_gd; + + flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS); + if (flex_gd == NULL) + goto out3; + + max_resize_bg = umin(flexbg_size, MAX_RESIZE_BG); + flex_gd->resize_bg = max_resize_bg; + + /* Avoid allocating large 'groups' array if not needed */ + last_group = o_group | (flex_gd->resize_bg - 1); + if (n_group <= last_group) + flex_gd->resize_bg = 1 << fls(n_group - o_group); + else if (n_group - last_group < flex_gd->resize_bg) + flex_gd->resize_bg = 1 << max(fls(last_group - o_group), + fls(n_group - last_group)); + + if (WARN_ON_ONCE(flex_gd->resize_bg > max_resize_bg)) + flex_gd->resize_bg = max_resize_bg; + + flex_gd->groups = kmalloc_array(flex_gd->resize_bg, + sizeof(struct ext4_new_group_data), + GFP_NOFS); + if (flex_gd->groups == NULL) + goto out2; + + flex_gd->bg_flags = kmalloc_array(flex_gd->resize_bg, sizeof(__u16), + GFP_NOFS); + if (flex_gd->bg_flags == NULL) + goto out1; + + return flex_gd; + +out1: + kfree(flex_gd->groups); +out2: + kfree(flex_gd); +out3: + return NULL; +} + +static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd) +{ + kfree(flex_gd->bg_flags); + kfree(flex_gd->groups); + kfree(flex_gd); +} + +/* + * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps + * and inode tables for a flex group. + * + * This function is used by 64bit-resize. Note that this function allocates + * group tables from the 1st group of groups contained by @flexgd, which may + * be a partial of a flex group. + * + * @sb: super block of fs to which the groups belongs + * + * Returns 0 on a successful allocation of the metadata blocks in the + * block group. + */ +static int ext4_alloc_group_tables(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd, + unsigned int flexbg_size) +{ + struct ext4_new_group_data *group_data = flex_gd->groups; + ext4_fsblk_t start_blk; + ext4_fsblk_t last_blk; + ext4_group_t src_group; + ext4_group_t bb_index = 0; + ext4_group_t ib_index = 0; + ext4_group_t it_index = 0; + ext4_group_t group; + ext4_group_t last_group; + unsigned overhead; + __u16 uninit_mask = (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0; + int i; + + BUG_ON(flex_gd->count == 0 || group_data == NULL); + + src_group = group_data[0].group; + last_group = src_group + flex_gd->count - 1; + + BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) != + (last_group & ~(flexbg_size - 1)))); +next_group: + group = group_data[0].group; + if (src_group >= group_data[0].group + flex_gd->count) + return -ENOSPC; + start_blk = ext4_group_first_block_no(sb, src_group); + last_blk = start_blk + group_data[src_group - group].blocks_count; + + overhead = ext4_group_overhead_blocks(sb, src_group); + + start_blk += overhead; + + /* We collect contiguous blocks as much as possible. 
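+ * A following group with no super/GDT overhead simply extends last_blk, + * so one contiguous run can span several groups of the flex group.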
*/ + src_group++; + for (; src_group <= last_group; src_group++) { + overhead = ext4_group_overhead_blocks(sb, src_group); + if (overhead == 0) + last_blk += group_data[src_group - group].blocks_count; + else + break; + } + + /* Allocate block bitmaps */ + for (; bb_index < flex_gd->count; bb_index++) { + if (start_blk >= last_blk) + goto next_group; + group_data[bb_index].block_bitmap = start_blk++; + group = ext4_get_group_number(sb, start_blk - 1); + group -= group_data[0].group; + group_data[group].mdata_blocks++; + flex_gd->bg_flags[group] &= uninit_mask; + } + + /* Allocate inode bitmaps */ + for (; ib_index < flex_gd->count; ib_index++) { + if (start_blk >= last_blk) + goto next_group; + group_data[ib_index].inode_bitmap = start_blk++; + group = ext4_get_group_number(sb, start_blk - 1); + group -= group_data[0].group; + group_data[group].mdata_blocks++; + flex_gd->bg_flags[group] &= uninit_mask; + } + + /* Allocate inode tables */ + for (; it_index < flex_gd->count; it_index++) { + unsigned int itb = EXT4_SB(sb)->s_itb_per_group; + ext4_fsblk_t next_group_start; + + if (start_blk + itb > last_blk) + goto next_group; + group_data[it_index].inode_table = start_blk; + group = ext4_get_group_number(sb, start_blk); + next_group_start = ext4_group_first_block_no(sb, group + 1); + group -= group_data[0].group; + + if (start_blk + itb > next_group_start) { + flex_gd->bg_flags[group + 1] &= uninit_mask; + overhead = start_blk + itb - next_group_start; + group_data[group + 1].mdata_blocks += overhead; + itb -= overhead; + } + + group_data[group].mdata_blocks += itb; + flex_gd->bg_flags[group] &= uninit_mask; + start_blk += EXT4_SB(sb)->s_itb_per_group; + } + + /* Update free clusters count to exclude metadata blocks */ + for (i = 0; i < flex_gd->count; i++) { + group_data[i].free_clusters_count -= + EXT4_NUM_B2C(EXT4_SB(sb), + group_data[i].mdata_blocks); + } + + if (test_opt(sb, DEBUG)) { + int i; + group = group_data[0].group; + + printk(KERN_DEBUG "EXT4-fs: adding a flex group with " + "%u groups, flexbg size is %u:\n", flex_gd->count, + flexbg_size); + + for (i = 0; i < flex_gd->count; i++) { + ext4_debug( + "adding %s group %u: %u blocks (%u free, %u mdata blocks)\n", + ext4_bg_has_super(sb, group + i) ? "normal" : + "no-super", group + i, + group_data[i].blocks_count, + group_data[i].free_clusters_count, + group_data[i].mdata_blocks); + } + } + return 0; +} + +static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, + ext4_fsblk_t blk) +{ + struct buffer_head *bh; + int err; + + bh = sb_getblk(sb, blk); + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); + if (err) { + brelse(bh); + bh = ERR_PTR(err); + } else { + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + } + + return bh; +} + +static int ext4_resize_ensure_credits_batch(handle_t *handle, int credits) +{ + return ext4_journal_ensure_credits_fn(handle, credits, + EXT4_MAX_TRANS_DATA, 0, 0); +} + +/* + * set_flexbg_block_bitmap() marks clusters [@first_cluster, @last_cluster] used. + * + * Helper function for setup_new_flex_group_blocks() which sets the + * corresponding bits in the on-disk block bitmaps of the new flex group.
+ * + * @sb: super block + * @handle: journal handle + * @flex_gd: flex group data + */ +static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, + struct ext4_new_flex_group_data *flex_gd, + ext4_fsblk_t first_cluster, ext4_fsblk_t last_cluster) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t count = last_cluster - first_cluster + 1; + ext4_group_t count2; + + ext4_debug("mark clusters [%llu-%llu] used\n", first_cluster, + last_cluster); + for (; count > 0; count -= count2, first_cluster += count2) { + ext4_fsblk_t start; + struct buffer_head *bh; + ext4_group_t group; + int err; + + group = ext4_get_group_number(sb, EXT4_C2B(sbi, first_cluster)); + start = EXT4_B2C(sbi, ext4_group_first_block_no(sb, group)); + group -= flex_gd->groups[0].group; + + count2 = EXT4_CLUSTERS_PER_GROUP(sb) - (first_cluster - start); + if (count2 > count) + count2 = count; + + if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) { + BUG_ON(flex_gd->count > 1); + continue; + } + + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) + return err; + + bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap); + if (unlikely(!bh)) + return -ENOMEM; + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, bh, + EXT4_JTR_NONE); + if (err) { + brelse(bh); + return err; + } + ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", + first_cluster, first_cluster - start, count2); + mb_set_bits(bh->b_data, first_cluster - start, count2); + + err = ext4_handle_dirty_metadata(handle, NULL, bh); + brelse(bh); + if (unlikely(err)) + return err; + } + + return 0; +} + +/* + * Set up the block and inode bitmaps, and the inode table for the new groups. + * This doesn't need to be part of the main transaction, since we are only + * changing blocks outside the actual filesystem. We still do journaling to + * ensure the recovery is correct in case of a failure just after resize. + * If any part of this fails, we simply abort the resize. + * + * setup_new_flex_group_blocks handles a flex group as follows: + * 1. copy super block and GDT, and initialize group tables if necessary. + * In this step, we only set bits in block bitmaps for blocks taken by + * super block and GDT. + * 2. allocate group tables in block bitmaps, that is, set bits in block + * bitmap for blocks taken by group tables.
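+ * + * Both steps run under a single EXT4_HT_RESIZE transaction, which is + * extended as needed via ext4_resize_ensure_credits_batch().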
+ */ +static int setup_new_flex_group_blocks(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group}; + ext4_fsblk_t start; + ext4_fsblk_t block; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct ext4_new_group_data *group_data = flex_gd->groups; + __u16 *bg_flags = flex_gd->bg_flags; + handle_t *handle; + ext4_group_t group, count; + struct buffer_head *bh = NULL; + int reserved_gdb, i, j, err = 0, err2; + int meta_bg; + + BUG_ON(!flex_gd->count || !group_data || + group_data[0].group != sbi->s_groups_count); + + reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); + meta_bg = ext4_has_feature_meta_bg(sb); + + /* This transaction may be extended/restarted along the way */ + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + group = group_data[0].group; + for (i = 0; i < flex_gd->count; i++, group++) { + unsigned long gdblocks; + ext4_grpblk_t overhead; + + gdblocks = ext4_bg_num_gdb(sb, group); + start = ext4_group_first_block_no(sb, group); + + if (meta_bg == 0 && !ext4_bg_has_super(sb, group)) + goto handle_itb; + + if (meta_bg == 1) + goto handle_itb; + + block = start + ext4_bg_has_super(sb, group); + /* Copy all of the GDT blocks into the backup in this group */ + for (j = 0; j < gdblocks; j++, block++) { + struct buffer_head *gdb; + + ext4_debug("update backup group %#04llx\n", block); + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) + goto out; + + gdb = sb_getblk(sb, block); + if (unlikely(!gdb)) { + err = -ENOMEM; + goto out; + } + + BUFFER_TRACE(gdb, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, gdb, + EXT4_JTR_NONE); + if (err) { + brelse(gdb); + goto out; + } + memcpy(gdb->b_data, sbi_array_rcu_deref(sbi, + s_group_desc, j)->b_data, gdb->b_size); + set_buffer_uptodate(gdb); + + err = ext4_handle_dirty_metadata(handle, NULL, gdb); + if (unlikely(err)) { + brelse(gdb); + goto out; + } + brelse(gdb); + } + + /* Zero out all of the reserved backup group descriptor + * table blocks + */ + if (ext4_bg_has_super(sb, group)) { + err = sb_issue_zeroout(sb, gdblocks + start + 1, + reserved_gdb, GFP_NOFS); + if (err) + goto out; + } + +handle_itb: + /* Initialize group tables of the group @group */ + if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) + goto handle_bb; + + /* Zero out all of the inode table blocks */ + block = group_data[i].inode_table; + ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", + block, sbi->s_itb_per_group); + err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, + GFP_NOFS); + if (err) + goto out; + +handle_bb: + if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT) + goto handle_ib; + + /* Initialize block bitmap of the @group */ + block = group_data[i].block_bitmap; + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) + goto out; + + bh = bclean(handle, sb, block); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + goto out; + } + overhead = ext4_group_overhead_blocks(sb, group); + if (overhead != 0) { + ext4_debug("mark backup superblock %#04llx (+0)\n", + start); + mb_set_bits(bh->b_data, 0, + EXT4_NUM_B2C(sbi, overhead)); + } + ext4_mark_bitmap_end(EXT4_B2C(sbi, group_data[i].blocks_count), + sb->s_blocksize * 8, bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + brelse(bh); + if (err) + goto out; + +handle_ib: + if (bg_flags[i] & EXT4_BG_INODE_UNINIT) + continue; + + /* Initialize inode bitmap of the 
@group */ + block = group_data[i].inode_bitmap; + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) + goto out; + /* Mark unused entries in inode bitmap used */ + bh = bclean(handle, sb, block); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + goto out; + } + + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), + sb->s_blocksize * 8, bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + brelse(bh); + if (err) + goto out; + } + + /* Mark group tables in block bitmap */ + for (j = 0; j < GROUP_TABLE_COUNT; j++) { + count = group_table_count[j]; + start = (&group_data[0].block_bitmap)[j]; + block = start; + for (i = 1; i < flex_gd->count; i++) { + block += group_table_count[j]; + if (block == (&group_data[i].block_bitmap)[j]) { + count += group_table_count[j]; + continue; + } + err = set_flexbg_block_bitmap(sb, handle, + flex_gd, + EXT4_B2C(sbi, start), + EXT4_B2C(sbi, + start + count + - 1)); + if (err) + goto out; + count = group_table_count[j]; + start = (&group_data[i].block_bitmap)[j]; + block = start; + } + + err = set_flexbg_block_bitmap(sb, handle, + flex_gd, + EXT4_B2C(sbi, start), + EXT4_B2C(sbi, + start + count + - 1)); + if (err) + goto out; + } + +out: + err2 = ext4_journal_stop(handle); + if (err2 && !err) + err = err2; + + return err; +} + +/* + * Iterate through the groups which hold BACKUP superblock/GDT copies in an + * ext4 filesystem. The counters should be initialized to 1, 5, and 7 before + * calling this for the first time. In a sparse filesystem it will be the + * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... + * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... + */ +unsigned int ext4_list_backups(struct super_block *sb, unsigned int *three, + unsigned int *five, unsigned int *seven) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + unsigned int *min = three; + int mult = 3; + unsigned int ret; + + if (ext4_has_feature_sparse_super2(sb)) { + do { + if (*min > 2) + return UINT_MAX; + ret = le32_to_cpu(es->s_backup_bgs[*min - 1]); + *min += 1; + } while (!ret); + return ret; + } + + if (!ext4_has_feature_sparse_super(sb)) { + ret = *min; + *min += 1; + return ret; + } + + if (*five < *min) { + min = five; + mult = 5; + } + if (*seven < *min) { + min = seven; + mult = 7; + } + + ret = *min; + *min *= mult; + + return ret; +} + +/* + * Check that all of the backup GDT blocks are held in the primary GDT block. + * It is assumed that they are stored in group order. Returns the number of + * groups in current filesystem that have BACKUPS, or -ve error code. + */ +static int verify_reserved_gdb(struct super_block *sb, + ext4_group_t end, + struct buffer_head *primary) +{ + const ext4_fsblk_t blk = primary->b_blocknr; + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + unsigned grp; + __le32 *p = (__le32 *)primary->b_data; + int gdbackups = 0; + + while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) { + if (le32_to_cpu(*p++) != + grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){ + ext4_warning(sb, "reserved GDT %llu" + " missing grp %d (%llu)", + blk, grp, + grp * + (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + + blk); + return -EINVAL; + } + if (++gdbackups > EXT4_ADDR_PER_BLOCK(sb)) + return -EFBIG; + } + + return gdbackups; +} + +/* + * Called when we need to bring a reserved group descriptor table block into + * use from the resize inode. The primary copy of the new GDT block currently + * is an indirect block (under the double indirect block in the resize inode). 
+ * The new backup GDT blocks will be stored as leaf blocks in this indirect + * block, in group order. Even though we know all the block numbers we need, + * we check to ensure that the resize inode has actually reserved these blocks. + * + * Don't need to update the block bitmaps because the blocks are still in use. + * + * We get all of the error cases out of the way, so that we are sure to not + * fail once we start modifying the data on disk, because JBD has no rollback. + */ +static int add_new_gdb(handle_t *handle, struct inode *inode, + ext4_group_t group) +{ + struct super_block *sb = inode->i_sb; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; + struct buffer_head **o_group_desc, **n_group_desc = NULL; + struct buffer_head *dind = NULL; + struct buffer_head *gdb_bh = NULL; + int gdbackups; + struct ext4_iloc iloc = { .bh = NULL }; + __le32 *data; + int err; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG + "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", + gdb_num); + + gdb_bh = ext4_sb_bread(sb, gdblock, 0); + if (IS_ERR(gdb_bh)) + return PTR_ERR(gdb_bh); + + gdbackups = verify_reserved_gdb(sb, group, gdb_bh); + if (gdbackups < 0) { + err = gdbackups; + goto errout; + } + + data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; + dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0); + if (IS_ERR(dind)) { + err = PTR_ERR(dind); + dind = NULL; + goto errout; + } + + data = (__le32 *)dind->b_data; + if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { + ext4_warning(sb, "new group %u GDT block %llu not reserved", + group, gdblock); + err = -EINVAL; + goto errout; + } + + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, + EXT4_JTR_NONE); + if (unlikely(err)) + goto errout; + + BUFFER_TRACE(gdb_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, gdb_bh, EXT4_JTR_NONE); + if (unlikely(err)) + goto errout; + + BUFFER_TRACE(dind, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, dind, EXT4_JTR_NONE); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto errout; + } + + /* ext4_reserve_inode_write() gets a reference on the iloc */ + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (unlikely(err)) + goto errout; + + n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), + GFP_KERNEL); + if (!n_group_desc) { + err = -ENOMEM; + ext4_warning(sb, "not enough memory for %lu groups", + gdb_num + 1); + goto errout; + } + + /* + * Finally, we have all of the possible failures behind us... + * + * Remove new GDT block from inode double-indirect block and clear out + * the new GDT block for use (which also "frees" the backup GDT blocks + * from the reserved inode). We don't need to change the bitmaps for + * these blocks, because they are marked as in-use from being in the + * reserved inode, and will become GDT blocks (primary and backup). 
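	 * In other words, the block only changes role: it stays allocated
	 * on disk, but moves from the resize inode's double-indirect tree
	 * into the s_group_desc array updated below.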
+ */ + data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; + err = ext4_handle_dirty_metadata(handle, NULL, dind); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto errout; + } + inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> + (9 - EXT4_SB(sb)->s_cluster_bits); + ext4_mark_iloc_dirty(handle, inode, &iloc); + memset(gdb_bh->b_data, 0, sb->s_blocksize); + err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); + if (unlikely(err)) { + ext4_std_error(sb, err); + iloc.bh = NULL; + goto errout; + } + brelse(dind); + + rcu_read_lock(); + o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc); + memcpy(n_group_desc, o_group_desc, + EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); + rcu_read_unlock(); + n_group_desc[gdb_num] = gdb_bh; + rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc); + EXT4_SB(sb)->s_gdb_count++; + ext4_kvfree_array_rcu(o_group_desc); + + lock_buffer(EXT4_SB(sb)->s_sbh); + le16_add_cpu(&es->s_reserved_gdt_blocks, -1); + ext4_superblock_csum_set(sb); + unlock_buffer(EXT4_SB(sb)->s_sbh); + err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + if (err) + ext4_std_error(sb, err); + return err; +errout: + kvfree(n_group_desc); + brelse(iloc.bh); + brelse(dind); + brelse(gdb_bh); + + ext4_debug("leaving with error %d\n", err); + return err; +} + +/* + * If there is no available space in the existing block group descriptors for + * the new block group and there are no reserved block group descriptors, then + * the meta_bg feature will get enabled, and es->s_first_meta_bg will get set + * to the first block group that is managed using meta_bg and s_first_meta_bg + * must be a multiple of EXT4_DESC_PER_BLOCK(sb). + * This function will be called when first group of meta_bg is added to bring + * new group descriptors block of new added meta_bg. + */ +static int add_new_gdb_meta_bg(struct super_block *sb, + handle_t *handle, ext4_group_t group) { + ext4_fsblk_t gdblock; + struct buffer_head *gdb_bh; + struct buffer_head **o_group_desc, **n_group_desc; + unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + int err; + + gdblock = ext4_group_first_block_no(sb, group) + + ext4_bg_has_super(sb, group); + gdb_bh = ext4_sb_bread(sb, gdblock, 0); + if (IS_ERR(gdb_bh)) + return PTR_ERR(gdb_bh); + n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), + GFP_KERNEL); + if (!n_group_desc) { + brelse(gdb_bh); + err = -ENOMEM; + ext4_warning(sb, "not enough memory for %lu groups", + gdb_num + 1); + return err; + } + + rcu_read_lock(); + o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc); + memcpy(n_group_desc, o_group_desc, + EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); + rcu_read_unlock(); + n_group_desc[gdb_num] = gdb_bh; + + BUFFER_TRACE(gdb_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, gdb_bh, EXT4_JTR_NONE); + if (err) { + kvfree(n_group_desc); + brelse(gdb_bh); + return err; + } + + rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc); + EXT4_SB(sb)->s_gdb_count++; + ext4_kvfree_array_rcu(o_group_desc); + return err; +} + +/* + * Called when we are adding a new group which has a backup copy of each of + * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. + * We need to add these reserved backup GDT blocks to the resize inode, so + * that they are kept for future resizing and not allocated to files. + * + * Each reserved backup GDT block will go into a different indirect block. 
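+ * (The reserved primaries sit immediately after the in-use GDT blocks,
+ * i.e. reserved primary i lives at s_sbh->b_blocknr + 1 + s_gdb_count + i,
+ * which is how blk is seeded for the verification loop below.)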
+ * The indirect blocks are actually the primary reserved GDT blocks, + * so we know in advance what their block numbers are. We only get the + * double-indirect block to verify it is pointing to the primary reserved + * GDT blocks so we don't overwrite a data block by accident. The reserved + * backup GDT blocks are stored in their reserved primary GDT block. + */ +static int reserve_backup_gdb(handle_t *handle, struct inode *inode, + ext4_group_t group) +{ + struct super_block *sb = inode->i_sb; + int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); + int cluster_bits = EXT4_SB(sb)->s_cluster_bits; + struct buffer_head **primary; + struct buffer_head *dind; + struct ext4_iloc iloc; + ext4_fsblk_t blk; + __le32 *data, *end; + int gdbackups = 0; + int res, i; + int err; + + primary = kmalloc_array(reserved_gdb, sizeof(*primary), GFP_NOFS); + if (!primary) + return -ENOMEM; + + data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; + dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0); + if (IS_ERR(dind)) { + err = PTR_ERR(dind); + dind = NULL; + goto exit_free; + } + + blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count; + data = (__le32 *)dind->b_data + (EXT4_SB(sb)->s_gdb_count % + EXT4_ADDR_PER_BLOCK(sb)); + end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb); + + /* Get each reserved primary GDT block and verify it holds backups */ + for (res = 0; res < reserved_gdb; res++, blk++) { + if (le32_to_cpu(*data) != blk) { + ext4_warning(sb, "reserved block %llu" + " not at offset %ld", + blk, + (long)(data - (__le32 *)dind->b_data)); + err = -EINVAL; + goto exit_bh; + } + primary[res] = ext4_sb_bread(sb, blk, 0); + if (IS_ERR(primary[res])) { + err = PTR_ERR(primary[res]); + primary[res] = NULL; + goto exit_bh; + } + gdbackups = verify_reserved_gdb(sb, group, primary[res]); + if (gdbackups < 0) { + brelse(primary[res]); + err = gdbackups; + goto exit_bh; + } + if (++data >= end) + data = (__le32 *)dind->b_data; + } + + for (i = 0; i < reserved_gdb; i++) { + BUFFER_TRACE(primary[i], "get_write_access"); + if ((err = ext4_journal_get_write_access(handle, sb, primary[i], + EXT4_JTR_NONE))) + goto exit_bh; + } + + if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) + goto exit_bh; + + /* + * Finally we can add each of the reserved backup GDT blocks from + * the new group to its reserved primary GDT block. + */ + blk = group * EXT4_BLOCKS_PER_GROUP(sb); + for (i = 0; i < reserved_gdb; i++) { + int err2; + data = (__le32 *)primary[i]->b_data; + data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); + err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]); + if (!err) + err = err2; + } + + inode->i_blocks += reserved_gdb * sb->s_blocksize >> (9 - cluster_bits); + ext4_mark_iloc_dirty(handle, inode, &iloc); + +exit_bh: + while (--res >= 0) + brelse(primary[res]); + brelse(dind); + +exit_free: + kfree(primary); + + return err; +} + +static inline void ext4_set_block_group_nr(struct super_block *sb, char *data, + ext4_group_t group) +{ + struct ext4_super_block *es = (struct ext4_super_block *) data; + + es->s_block_group_nr = cpu_to_le16(group); + if (ext4_has_feature_metadata_csum(sb)) + es->s_checksum = ext4_superblock_csum(es); +} + +/* + * Update the backup copies of the ext4 metadata. These don't need to be part + * of the main resize transaction, because e2fsck will re-write them if there + * is a problem (basically only OOM will cause a problem). 
However, we
+ * _should_ update the backups if possible, in case the primary gets trashed
+ * for some reason and we need to run e2fsck from a backup superblock.  The
+ * important part is that the new block and inode counts are in the backup
+ * superblocks, and the location of the new group metadata in the GDT backups.
+ *
+ * We do not need to take the s_resize_lock for this, because these
+ * blocks are not otherwise touched by the filesystem code when it is
+ * mounted.  We don't need to worry about last changing from
+ * sbi->s_groups_count, because the worst that can happen is that we
+ * do not copy the full number of backups at this time.  The resize
+ * which changed s_groups_count will back up again.
+ */
+static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
+			   int size, int meta_bg)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_group_t last;
+	const int bpg = EXT4_BLOCKS_PER_GROUP(sb);
+	unsigned three = 1;
+	unsigned five = 5;
+	unsigned seven = 7;
+	ext4_group_t group = 0;
+	int rest = sb->s_blocksize - size;
+	handle_t *handle;
+	int err = 0, err2;
+
+	handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA);
+	if (IS_ERR(handle)) {
+		group = 1;
+		err = PTR_ERR(handle);
+		goto exit_err;
+	}
+
+	if (meta_bg == 0) {
+		group = ext4_list_backups(sb, &three, &five, &seven);
+		last = sbi->s_groups_count;
+	} else {
+		group = ext4_get_group_number(sb, blk_off) + 1;
+		last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2);
+	}
+
+	while (group < sbi->s_groups_count) {
+		struct buffer_head *bh;
+		ext4_fsblk_t backup_block;
+		int has_super = ext4_bg_has_super(sb, group);
+		ext4_fsblk_t first_block = ext4_group_first_block_no(sb, group);
+
+		/* Out of journal space, and can't get more - abort - so sad */
+		err = ext4_resize_ensure_credits_batch(handle, 1);
+		if (err < 0)
+			break;
+
+		if (meta_bg == 0)
+			backup_block = ((ext4_fsblk_t)group) * bpg + blk_off;
+		else
+			backup_block = first_block + has_super;
+
+		bh = sb_getblk(sb, backup_block);
+		if (unlikely(!bh)) {
+			err = -ENOMEM;
+			break;
+		}
+		ext4_debug("update metadata backup %llu(+%llu)\n",
+			   backup_block, backup_block -
+			   ext4_group_first_block_no(sb, group));
+		BUFFER_TRACE(bh, "get_write_access");
+		if ((err = ext4_journal_get_write_access(handle, sb, bh,
+							 EXT4_JTR_NONE))) {
+			brelse(bh);
+			break;
+		}
+		lock_buffer(bh);
+		memcpy(bh->b_data, data, size);
+		if (rest)
+			memset(bh->b_data + size, 0, rest);
+		if (has_super && (backup_block == first_block))
+			ext4_set_block_group_nr(sb, bh->b_data, group);
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (unlikely(err))
+			ext4_std_error(sb, err);
+		brelse(bh);
+
+		if (meta_bg == 0)
+			group = ext4_list_backups(sb, &three, &five, &seven);
+		else if (group == last)
+			break;
+		else
+			group = last;
+	}
+	if ((err2 = ext4_journal_stop(handle)) && !err)
+		err = err2;
+
+	/*
+	 * Ugh! Need to have e2fsck write the backup copies.  It is too
+	 * late to revert the resize, we shouldn't fail just because of
+	 * the backup copies (they are only needed in case of corruption).
+	 *
+	 * However, if we got here we have a journal problem too, so we
+	 * can't really start a transaction to mark the superblock.
+	 * Chicken out and just set the flag on the hope it will be written
+	 * to disk, and if not - we will simply wait until next fsck.
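+	 * (Clearing EXT4_VALID_FS below is what forces that fsck: e2fsck
+	 * performs a full check whenever the "clean" flag is not set.)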
+	 */
+exit_err:
+	if (err) {
+		ext4_warning(sb, "can't update backup for group %u (err %d), "
+			     "forcing fsck on next reboot", group, err);
+		sbi->s_mount_state &= ~EXT4_VALID_FS;
+		sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
+		mark_buffer_dirty(sbi->s_sbh);
+	}
+}
+
+/*
+ * ext4_add_new_descs() adds @count group descriptors of groups
+ * starting at @group
+ *
+ * @handle: journal handle
+ * @sb: super block
+ * @group: the group no. of the first group desc to be added
+ * @resize_inode: the resize inode
+ * @count: number of group descriptors to be added
+ */
+static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
+			      ext4_group_t group, struct inode *resize_inode,
+			      ext4_group_t count)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	struct buffer_head *gdb_bh;
+	int i, gdb_off, gdb_num, err = 0;
+	int meta_bg;
+
+	meta_bg = ext4_has_feature_meta_bg(sb);
+	for (i = 0; i < count; i++, group++) {
+		int reserved_gdb = ext4_bg_has_super(sb, group) ?
+			le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
+
+		gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
+		gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+
+		/*
+		 * We will only either add reserved group blocks to a backup group
+		 * or remove reserved blocks for the first group in a new group block.
+		 * Doing both would mean more complex code, and sane people don't
+		 * use non-sparse filesystems anymore. This is already checked above.
+		 */
+		if (gdb_off) {
+			gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
+						     gdb_num);
+			BUFFER_TRACE(gdb_bh, "get_write_access");
+			err = ext4_journal_get_write_access(handle, sb, gdb_bh,
+							    EXT4_JTR_NONE);
+
+			if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group))
+				err = reserve_backup_gdb(handle, resize_inode, group);
+		} else if (meta_bg != 0) {
+			err = add_new_gdb_meta_bg(sb, handle, group);
+		} else {
+			err = add_new_gdb(handle, resize_inode, group);
+		}
+		if (err)
+			break;
+	}
+	return err;
+}
+
+static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
+{
+	struct buffer_head *bh = sb_getblk(sb, block);
+	if (unlikely(!bh))
+		return NULL;
+	if (!bh_uptodate_or_lock(bh)) {
+		if (ext4_read_bh(bh, 0, NULL, false) < 0) {
+			brelse(bh);
+			return NULL;
+		}
+	}
+
+	return bh;
+}
+
+static int ext4_set_bitmap_checksums(struct super_block *sb,
+				     struct ext4_group_desc *gdp,
+				     struct ext4_new_group_data *group_data)
+{
+	struct buffer_head *bh;
+
+	if (!ext4_has_feature_metadata_csum(sb))
+		return 0;
+
+	bh = ext4_get_bitmap(sb, group_data->inode_bitmap);
+	if (!bh)
+		return -EIO;
+	ext4_inode_bitmap_csum_set(sb, gdp, bh);
+	brelse(bh);
+
+	bh = ext4_get_bitmap(sb, group_data->block_bitmap);
+	if (!bh)
+		return -EIO;
+	ext4_block_bitmap_csum_set(sb, gdp, bh);
+	brelse(bh);
+
+	return 0;
+}
+
+/*
+ * ext4_setup_new_descs() will set up the group descriptors of a flex bg
+ */
+static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
+				struct ext4_new_flex_group_data *flex_gd)
+{
+	struct ext4_new_group_data *group_data = flex_gd->groups;
+	struct ext4_group_desc *gdp;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct buffer_head *gdb_bh;
+	ext4_group_t group;
+	__u16 *bg_flags = flex_gd->bg_flags;
+	int i, gdb_off, gdb_num, err = 0;
+
+
+	for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) {
+		group = group_data->group;
+
+		gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
+		gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+
+		/*
+		 * get_write_access() has been called on gdb_bh by ext4_add_new_descs().
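+		 * The buffer can therefore be modified and marked dirty here
+		 * without taking write access a second time.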
+ */ + gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_num); + /* Update group descriptor block for new group */ + gdp = (struct ext4_group_desc *)(gdb_bh->b_data + + gdb_off * EXT4_DESC_SIZE(sb)); + + memset(gdp, 0, EXT4_DESC_SIZE(sb)); + ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap); + ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap); + err = ext4_set_bitmap_checksums(sb, gdp, group_data); + if (err) { + ext4_std_error(sb, err); + break; + } + + ext4_inode_table_set(sb, gdp, group_data->inode_table); + ext4_free_group_clusters_set(sb, gdp, + group_data->free_clusters_count); + ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); + if (ext4_has_group_desc_csum(sb)) + ext4_itable_unused_set(sb, gdp, + EXT4_INODES_PER_GROUP(sb)); + gdp->bg_flags = cpu_to_le16(*bg_flags); + ext4_group_desc_csum_set(sb, group, gdp); + + err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); + if (unlikely(err)) { + ext4_std_error(sb, err); + break; + } + + /* + * We can allocate memory for mb_alloc based on the new group + * descriptor + */ + err = ext4_mb_add_groupinfo(sb, group, gdp); + if (err) + break; + } + return err; +} + +static void ext4_add_overhead(struct super_block *sb, + const ext4_fsblk_t overhead) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + + sbi->s_overhead += overhead; + es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead); + smp_wmb(); +} + +/* + * ext4_update_super() updates the super block so that the newly added + * groups can be seen by the filesystem. + * + * @sb: super block + * @flex_gd: new added groups + */ +static void ext4_update_super(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + ext4_fsblk_t blocks_count = 0; + ext4_fsblk_t free_blocks = 0; + ext4_fsblk_t reserved_blocks = 0; + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int i; + + BUG_ON(flex_gd->count == 0 || group_data == NULL); + /* + * Make the new blocks and inodes valid next. We do this before + * increasing the group count so that once the group is enabled, + * all of its blocks and inodes are already valid. + * + * We always allocate group-by-group, then block-by-block or + * inode-by-inode within a group, so enabling these + * blocks/inodes before the group is live won't actually let us + * allocate the new space yet. + */ + for (i = 0; i < flex_gd->count; i++) { + blocks_count += group_data[i].blocks_count; + free_blocks += EXT4_C2B(sbi, group_data[i].free_clusters_count); + } + + reserved_blocks = ext4_r_blocks_count(es) * 100; + reserved_blocks = div64_u64(reserved_blocks, ext4_blocks_count(es)); + reserved_blocks *= blocks_count; + do_div(reserved_blocks, 100); + + lock_buffer(sbi->s_sbh); + ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); + ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks); + le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * + flex_gd->count); + le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) * + flex_gd->count); + + ext4_debug("free blocks count %llu", ext4_free_blocks_count(es)); + /* + * We need to protect s_groups_count against other CPUs seeing + * inconsistent state in the superblock. 
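+	 * A reader that observes the new s_groups_count must also observe
+	 * every descriptor and bitmap initialized above, or it could try
+	 * to allocate from a half-built group.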
+ * + * The precise rules we use are: + * + * * Writers must perform a smp_wmb() after updating all + * dependent data and before modifying the groups count + * + * * Readers must perform an smp_rmb() after reading the groups + * count and before reading any dependent data. + * + * NB. These rules can be relaxed when checking the group count + * while freeing data, as we can only allocate from a block + * group after serialising against the group count, and we can + * only then free after serialising in turn against that + * allocation. + */ + smp_wmb(); + + /* Update the global fs size fields */ + sbi->s_groups_count += flex_gd->count; + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); + + /* Update the reserved block counts only once the new group is + * active. */ + ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + + reserved_blocks); + + /* Update the free space counts */ + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_NUM_B2C(sbi, free_blocks)); + percpu_counter_add(&sbi->s_freeinodes_counter, + EXT4_INODES_PER_GROUP(sb) * flex_gd->count); + + ext4_debug("free blocks count %llu", + percpu_counter_read(&sbi->s_freeclusters_counter)); + if (ext4_has_feature_flex_bg(sb) && sbi->s_log_groups_per_flex) { + ext4_group_t flex_group; + struct flex_groups *fg; + + flex_group = ext4_flex_group(sbi, group_data[0].group); + fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); + atomic64_add(EXT4_NUM_B2C(sbi, free_blocks), + &fg->free_clusters); + atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, + &fg->free_inodes); + } + + /* + * Update the fs overhead information. + * + * For bigalloc, if the superblock already has a properly calculated + * overhead, update it with a value based on numbers already computed + * above for the newly allocated capacity. + */ + if (ext4_has_feature_bigalloc(sb) && (sbi->s_overhead != 0)) + ext4_add_overhead(sb, + EXT4_NUM_B2C(sbi, blocks_count - free_blocks)); + else + ext4_calculate_overhead(sb); + es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead); + + ext4_superblock_csum_set(sb); + unlock_buffer(sbi->s_sbh); + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: added group %u:" + "%llu blocks(%llu free %llu reserved)\n", flex_gd->count, + blocks_count, free_blocks, reserved_blocks); +} + +/* Add a flex group to an fs. Ensure we handle all possible error conditions + * _before_ we start modifying the filesystem, because we cannot abort the + * transaction and not have it write the data to disk. + */ +static int ext4_flex_group_add(struct super_block *sb, + struct inode *resize_inode, + struct ext4_new_flex_group_data *flex_gd) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_fsblk_t o_blocks_count; + ext4_grpblk_t last; + ext4_group_t group; + handle_t *handle; + unsigned reserved_gdb; + int err = 0, err2 = 0, credit; + + BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags); + + reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); + o_blocks_count = ext4_blocks_count(es); + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + BUG_ON(last); + + err = setup_new_flex_group_blocks(sb, flex_gd); + if (err) + goto exit; + /* + * We will always be modifying at least the superblock and GDT + * blocks. If we are adding a group past the last current GDT block, + * we will also modify the inode and the dindirect block. 
If we + * are adding a group with superblock/GDT backups we will also + * modify each of the reserved GDT dindirect blocks. + */ + credit = 3; /* sb, resize inode, resize inode dindirect */ + /* GDT blocks */ + credit += 1 + DIV_ROUND_UP(flex_gd->count, EXT4_DESC_PER_BLOCK(sb)); + credit += reserved_gdb; /* Reserved GDT dindirect blocks */ + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credit); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto exit; + } + + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); + if (err) + goto exit_journal; + + group = flex_gd->groups[0].group; + BUG_ON(group != sbi->s_groups_count); + err = ext4_add_new_descs(handle, sb, group, + resize_inode, flex_gd->count); + if (err) + goto exit_journal; + + err = ext4_setup_new_descs(handle, sb, flex_gd); + if (err) + goto exit_journal; + + ext4_update_super(sb, flex_gd); + + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); + +exit_journal: + err2 = ext4_journal_stop(handle); + if (!err) + err = err2; + + if (!err) { + int gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + int gdb_num_end = ((group + flex_gd->count - 1) / + EXT4_DESC_PER_BLOCK(sb)); + int meta_bg = ext4_has_feature_meta_bg(sb) && + gdb_num >= le32_to_cpu(es->s_first_meta_bg); + sector_t padding_blocks = meta_bg ? 0 : sbi->s_sbh->b_blocknr - + ext4_group_first_block_no(sb, 0); + + update_backups(sb, ext4_group_first_block_no(sb, 0), + (char *)es, sizeof(struct ext4_super_block), 0); + for (; gdb_num <= gdb_num_end; gdb_num++) { + struct buffer_head *gdb_bh; + + gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, + gdb_num); + update_backups(sb, gdb_bh->b_blocknr - padding_blocks, + gdb_bh->b_data, gdb_bh->b_size, meta_bg); + } + } +exit: + return err; +} + +static int ext4_setup_next_flex_gd(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd, + ext4_fsblk_t n_blocks_count) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct ext4_new_group_data *group_data = flex_gd->groups; + ext4_fsblk_t o_blocks_count; + ext4_group_t n_group; + ext4_group_t group; + ext4_group_t last_group; + ext4_grpblk_t last; + ext4_grpblk_t clusters_per_group; + unsigned long i; + + clusters_per_group = EXT4_CLUSTERS_PER_GROUP(sb); + + o_blocks_count = ext4_blocks_count(es); + + if (o_blocks_count == n_blocks_count) + return 0; + + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + BUG_ON(last); + ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last); + + last_group = group | (flex_gd->resize_bg - 1); + if (last_group > n_group) + last_group = n_group; + + flex_gd->count = last_group - group + 1; + + for (i = 0; i < flex_gd->count; i++) { + int overhead; + + group_data[i].group = group + i; + group_data[i].blocks_count = EXT4_BLOCKS_PER_GROUP(sb); + overhead = ext4_group_overhead_blocks(sb, group + i); + group_data[i].mdata_blocks = overhead; + group_data[i].free_clusters_count = EXT4_CLUSTERS_PER_GROUP(sb); + if (ext4_has_group_desc_csum(sb)) { + flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | + EXT4_BG_INODE_UNINIT; + if (!test_opt(sb, INIT_INODE_TABLE)) + flex_gd->bg_flags[i] |= EXT4_BG_INODE_ZEROED; + } else + flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED; + } + + if (last_group == n_group && ext4_has_group_desc_csum(sb)) + /* We need to initialize block bitmap of last group. 
*/ + flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT; + + if ((last_group == n_group) && (last != clusters_per_group - 1)) { + group_data[i - 1].blocks_count = EXT4_C2B(sbi, last + 1); + group_data[i - 1].free_clusters_count -= clusters_per_group - + last - 1; + } + + return 1; +} + +/* Add group descriptor data to an existing or new group descriptor block. + * Ensure we handle all possible error conditions _before_ we start modifying + * the filesystem, because we cannot abort the transaction and not have it + * write the data to disk. + * + * If we are on a GDT block boundary, we need to get the reserved GDT block. + * Otherwise, we may need to add backup GDT blocks for a sparse group. + * + * We only need to hold the superblock lock while we are actually adding + * in the new group's counts to the superblock. Prior to that we have + * not really "added" the group at all. We re-check that we are still + * adding in the last group in case things have changed since verifying. + */ +int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) +{ + struct ext4_new_flex_group_data flex_gd; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int reserved_gdb = ext4_bg_has_super(sb, input->group) ? + le16_to_cpu(es->s_reserved_gdt_blocks) : 0; + struct inode *inode = NULL; + int gdb_off; + int err; + __u16 bg_flags = 0; + + gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); + + if (gdb_off == 0 && !ext4_has_feature_sparse_super(sb)) { + ext4_warning(sb, "Can't resize non-sparse filesystem further"); + return -EPERM; + } + + if (ext4_blocks_count(es) + input->blocks_count < + ext4_blocks_count(es)) { + ext4_warning(sb, "blocks_count overflow"); + return -EINVAL; + } + + if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < + le32_to_cpu(es->s_inodes_count)) { + ext4_warning(sb, "inodes_count overflow"); + return -EINVAL; + } + + if (reserved_gdb || gdb_off == 0) { + if (!ext4_has_feature_resize_inode(sb) || + !le16_to_cpu(es->s_reserved_gdt_blocks)) { + ext4_warning(sb, + "No reserved GDT blocks, can't resize"); + return -EPERM; + } + inode = ext4_iget(sb, EXT4_RESIZE_INO, EXT4_IGET_SPECIAL); + if (IS_ERR(inode)) { + ext4_warning(sb, "Error opening resize inode"); + return PTR_ERR(inode); + } + } + + + err = verify_group_input(sb, input); + if (err) + goto out; + + err = ext4_alloc_flex_bg_array(sb, input->group + 1); + if (err) + goto out; + + err = ext4_mb_alloc_groupinfo(sb, input->group + 1); + if (err) + goto out; + + flex_gd.count = 1; + flex_gd.groups = input; + flex_gd.bg_flags = &bg_flags; + err = ext4_flex_group_add(sb, inode, &flex_gd); +out: + iput(inode); + return err; +} /* ext4_group_add */ + +/* + * extend a group without checking assuming that checking has been done. + */ +static int ext4_group_extend_no_check(struct super_block *sb, + ext4_fsblk_t o_blocks_count, ext4_grpblk_t add) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + handle_t *handle; + int err = 0, err2; + + /* We will update the superblock, one block bitmap, and + * one group descriptor via ext4_group_add_blocks(). 
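+	 * That is three buffers in total, which is why the handle below is
+	 * started with exactly 3 credits.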
+ */ + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext4_warning(sb, "error %d on journal start", err); + return err; + } + + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, + EXT4_JTR_NONE); + if (err) { + ext4_warning(sb, "error %d on journal write access", err); + goto errout; + } + + lock_buffer(EXT4_SB(sb)->s_sbh); + ext4_blocks_count_set(es, o_blocks_count + add); + ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add); + ext4_superblock_csum_set(sb); + unlock_buffer(EXT4_SB(sb)->s_sbh); + ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); + /* We add the blocks to the bitmap and set the group need init bit */ + err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); + if (err) + goto errout; + ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); +errout: + err2 = ext4_journal_stop(handle); + if (err2 && !err) + err = err2; + + if (!err) { + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: extended group to %llu " + "blocks\n", ext4_blocks_count(es)); + update_backups(sb, ext4_group_first_block_no(sb, 0), + (char *)es, sizeof(struct ext4_super_block), 0); + } + return err; +} + +/* + * Extend the filesystem to the new number of blocks specified. This entry + * point is only used to extend the current filesystem to the end of the last + * existing group. It can be accessed via ioctl, or by "remount,resize=<size>" + * for emergencies (because it has no dependencies on reserved blocks). + * + * If we _really_ wanted, we could use default values to call ext4_group_add() + * allow the "remount" trick to work for arbitrary resizing, assuming enough + * GDT blocks are reserved to grow to the desired size. + */ +int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count) +{ + ext4_fsblk_t o_blocks_count; + ext4_grpblk_t last; + ext4_grpblk_t add; + struct buffer_head *bh; + ext4_group_t group; + + o_blocks_count = ext4_blocks_count(es); + + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "extending last group from %llu to %llu blocks", + o_blocks_count, n_blocks_count); + + if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) + return 0; + + if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { + ext4_msg(sb, KERN_ERR, + "filesystem too large to resize to %llu blocks safely", + n_blocks_count); + return -EINVAL; + } + + if (n_blocks_count < o_blocks_count) { + ext4_warning(sb, "can't shrink FS - resize aborted"); + return -EINVAL; + } + + /* Handle the remaining blocks in the last group only. 
*/ + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + + if (last == 0) { + ext4_warning(sb, "need to use ext2online to resize further"); + return -EPERM; + } + + add = EXT4_BLOCKS_PER_GROUP(sb) - last; + + if (o_blocks_count + add < o_blocks_count) { + ext4_warning(sb, "blocks_count overflow"); + return -EINVAL; + } + + if (o_blocks_count + add > n_blocks_count) + add = n_blocks_count - o_blocks_count; + + if (o_blocks_count + add < n_blocks_count) + ext4_warning(sb, "will only finish group (%llu blocks, %u new)", + o_blocks_count + add, add); + + /* See if the device is actually as big as what was requested */ + bh = ext4_sb_bread(sb, o_blocks_count + add - 1, 0); + if (IS_ERR(bh)) { + ext4_warning(sb, "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + + return ext4_group_extend_no_check(sb, o_blocks_count, add); +} /* ext4_group_extend */ + + +static int num_desc_blocks(struct super_block *sb, ext4_group_t groups) +{ + return (groups + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); +} + +/* + * Release the resize inode and drop the resize_inode feature if there + * are no more reserved gdt blocks, and then convert the file system + * to enable meta_bg + */ +static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) +{ + handle_t *handle; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_fsblk_t nr; + int i, ret, err = 0; + int credits = 1; + + ext4_msg(sb, KERN_INFO, "Converting file system to meta_bg"); + if (inode) { + if (es->s_reserved_gdt_blocks) { + ext4_error(sb, "Unexpected non-zero " + "s_reserved_gdt_blocks"); + return -EPERM; + } + + /* Do a quick sanity check of the resize inode */ + if (inode->i_blocks != 1 << (inode->i_blkbits - + (9 - sbi->s_cluster_bits))) + goto invalid_resize_inode; + for (i = 0; i < EXT4_N_BLOCKS; i++) { + if (i == EXT4_DIND_BLOCK) { + if (ei->i_data[i]) + continue; + else + goto invalid_resize_inode; + } + if (ei->i_data[i]) + goto invalid_resize_inode; + } + credits += 3; /* block bitmap, bg descriptor, resize inode */ + } + + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); + if (err) + goto errout; + + lock_buffer(sbi->s_sbh); + ext4_clear_feature_resize_inode(sb); + ext4_set_feature_meta_bg(sb); + sbi->s_es->s_first_meta_bg = + cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count)); + ext4_superblock_csum_set(sb); + unlock_buffer(sbi->s_sbh); + + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); + if (err) { + ext4_std_error(sb, err); + goto errout; + } + + if (inode) { + nr = le32_to_cpu(ei->i_data[EXT4_DIND_BLOCK]); + ext4_free_blocks(handle, inode, NULL, nr, 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + ei->i_data[EXT4_DIND_BLOCK] = 0; + inode->i_blocks = 0; + + err = ext4_mark_inode_dirty(handle, inode); + if (err) + ext4_std_error(sb, err); + } + +errout: + ret = ext4_journal_stop(handle); + return err ? 
err : ret; + +invalid_resize_inode: + ext4_error(sb, "corrupted/inconsistent resize inode"); + return -EINVAL; +} + +/* + * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count + * + * @sb: super block of the fs to be resized + * @n_blocks_count: the number of blocks resides in the resized fs + */ +int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) +{ + struct ext4_new_flex_group_data *flex_gd = NULL; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct buffer_head *bh; + struct inode *resize_inode = NULL; + ext4_grpblk_t add, offset; + unsigned long n_desc_blocks; + unsigned long o_desc_blocks; + ext4_group_t o_group; + ext4_group_t n_group; + ext4_fsblk_t o_blocks_count; + ext4_fsblk_t n_blocks_count_retry = 0; + unsigned long last_update_time = 0; + int err = 0; + int meta_bg; + unsigned int flexbg_size = ext4_flex_bg_size(sbi); + + /* See if the device is actually as big as what was requested */ + bh = ext4_sb_bread(sb, n_blocks_count - 1, 0); + if (IS_ERR(bh)) { + ext4_warning(sb, "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + + /* + * For bigalloc, trim the requested size to the nearest cluster + * boundary to avoid creating an unusable filesystem. We do this + * silently, instead of returning an error, to avoid breaking + * callers that blindly resize the filesystem to the full size of + * the underlying block device. + */ + if (ext4_has_feature_bigalloc(sb)) + n_blocks_count &= ~((1 << EXT4_CLUSTER_BITS(sb)) - 1); + +retry: + o_blocks_count = ext4_blocks_count(es); + + ext4_msg(sb, KERN_INFO, "resizing filesystem from %llu " + "to %llu blocks", o_blocks_count, n_blocks_count); + + if (n_blocks_count < o_blocks_count) { + /* On-line shrinking not supported */ + ext4_warning(sb, "can't shrink FS - resize aborted"); + return -EINVAL; + } + + if (n_blocks_count == o_blocks_count) + /* Nothing need to do */ + return 0; + + n_group = ext4_get_group_number(sb, n_blocks_count - 1); + if (n_group >= (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) { + ext4_warning(sb, "resize would cause inodes_count overflow"); + return -EINVAL; + } + ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); + + n_desc_blocks = num_desc_blocks(sb, n_group + 1); + o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count); + + meta_bg = ext4_has_feature_meta_bg(sb); + + if (ext4_has_feature_resize_inode(sb)) { + if (meta_bg) { + ext4_error(sb, "resize_inode and meta_bg enabled " + "simultaneously"); + return -EINVAL; + } + if (n_desc_blocks > o_desc_blocks + + le16_to_cpu(es->s_reserved_gdt_blocks)) { + n_blocks_count_retry = n_blocks_count; + n_desc_blocks = o_desc_blocks + + le16_to_cpu(es->s_reserved_gdt_blocks); + n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb); + n_blocks_count = (ext4_fsblk_t)n_group * + EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(es->s_first_data_block); + n_group--; /* set to last group number */ + } + + if (!resize_inode) + resize_inode = ext4_iget(sb, EXT4_RESIZE_INO, + EXT4_IGET_SPECIAL); + if (IS_ERR(resize_inode)) { + ext4_warning(sb, "Error opening resize inode"); + return PTR_ERR(resize_inode); + } + } + + if ((!resize_inode && !meta_bg && n_desc_blocks > o_desc_blocks) || n_blocks_count == o_blocks_count) { + err = ext4_convert_meta_bg(sb, resize_inode); + if (err) + goto out; + if (resize_inode) { + iput(resize_inode); + resize_inode = NULL; + } + if (n_blocks_count_retry) { + n_blocks_count = n_blocks_count_retry; + n_blocks_count_retry = 0; + goto 
retry; + } + } + + /* + * Make sure the last group has enough space so that it's + * guaranteed to have enough space for all metadata blocks + * that it might need to hold. (We might not need to store + * the inode table blocks in the last block group, but there + * will be cases where this might be needed.) + */ + if ((ext4_group_first_block_no(sb, n_group) + + ext4_group_overhead_blocks(sb, n_group) + 2 + + sbi->s_itb_per_group + sbi->s_cluster_ratio) >= n_blocks_count) { + n_blocks_count = ext4_group_first_block_no(sb, n_group); + n_group--; + n_blocks_count_retry = 0; + if (resize_inode) { + iput(resize_inode); + resize_inode = NULL; + } + goto retry; + } + + /* extend the last group */ + if (n_group == o_group) + add = n_blocks_count - o_blocks_count; + else + add = EXT4_C2B(sbi, EXT4_CLUSTERS_PER_GROUP(sb) - (offset + 1)); + if (add > 0) { + err = ext4_group_extend_no_check(sb, o_blocks_count, add); + if (err) + goto out; + } + + if (ext4_blocks_count(es) == n_blocks_count && n_blocks_count_retry == 0) + goto out; + + err = ext4_alloc_flex_bg_array(sb, n_group + 1); + if (err) + goto out; + + err = ext4_mb_alloc_groupinfo(sb, n_group + 1); + if (err) + goto out; + + flex_gd = alloc_flex_gd(flexbg_size, o_group, n_group); + if (flex_gd == NULL) { + err = -ENOMEM; + goto out; + } + + /* Add flex groups. Note that a regular group is a + * flex group with 1 group. + */ + while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count)) { + if (time_is_before_jiffies(last_update_time + HZ * 10)) { + if (last_update_time) + ext4_msg(sb, KERN_INFO, + "resized to %llu blocks", + ext4_blocks_count(es)); + last_update_time = jiffies; + } + if (ext4_alloc_group_tables(sb, flex_gd, flexbg_size) != 0) + break; + err = ext4_flex_group_add(sb, resize_inode, flex_gd); + if (unlikely(err)) + break; + } + + if (!err && n_blocks_count_retry) { + n_blocks_count = n_blocks_count_retry; + n_blocks_count_retry = 0; + free_flex_gd(flex_gd); + flex_gd = NULL; + if (resize_inode) { + iput(resize_inode); + resize_inode = NULL; + } + goto retry; + } + +out: + if (flex_gd) + free_flex_gd(flex_gd); + if (resize_inode != NULL) + iput(resize_inode); + if (err) + ext4_warning(sb, "error (%d) occurred during " + "file system resize", err); + ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", + ext4_blocks_count(es)); + return err; +} diff --git a/fs/ext4l/symlink.c b/fs/ext4l/symlink.c new file mode 100644 index 00000000000..645240cc022 --- /dev/null +++ b/fs/ext4l/symlink.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/symlink.c + * + * Only fast symlinks left here - the rest is done by generic code. 
AV, 1999 + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/symlink.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4 symlink handling code + */ + +#include <linux/fs.h> +#include <linux/namei.h> +#include "ext4.h" +#include "xattr.h" + +static const char *ext4_encrypted_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct buffer_head *bh = NULL; + const void *caddr; + unsigned int max_size; + const char *paddr; + + if (!dentry) + return ERR_PTR(-ECHILD); + + if (ext4_inode_is_fast_symlink(inode)) { + caddr = EXT4_I(inode)->i_data; + max_size = sizeof(EXT4_I(inode)->i_data); + } else { + bh = ext4_bread(NULL, inode, 0, 0); + if (IS_ERR(bh)) + return ERR_CAST(bh); + if (!bh) { + EXT4_ERROR_INODE(inode, "bad symlink."); + return ERR_PTR(-EFSCORRUPTED); + } + caddr = bh->b_data; + max_size = inode->i_sb->s_blocksize; + } + + paddr = fscrypt_get_symlink(inode, caddr, max_size, done); + brelse(bh); + return paddr; +} + +static int ext4_encrypted_symlink_getattr(struct mnt_idmap *idmap, + const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + ext4_getattr(idmap, path, stat, request_mask, query_flags); + + return fscrypt_symlink_getattr(path, stat); +} + +static void ext4_free_link(void *bh) +{ + brelse(bh); +} + +static const char *ext4_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + struct buffer_head *bh; + char *inline_link; + + /* + * Create a new inlined symlink is not supported, just provide a + * method to read the leftovers. + */ + if (ext4_has_inline_data(inode)) { + if (!dentry) + return ERR_PTR(-ECHILD); + + inline_link = ext4_read_inline_link(inode); + if (!IS_ERR(inline_link)) + set_delayed_call(callback, kfree_link, inline_link); + return inline_link; + } + + if (!dentry) { + bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT); + if (IS_ERR(bh) || !bh) + return ERR_PTR(-ECHILD); + if (!ext4_buffer_uptodate(bh)) { + brelse(bh); + return ERR_PTR(-ECHILD); + } + } else { + bh = ext4_bread(NULL, inode, 0, 0); + if (IS_ERR(bh)) + return ERR_CAST(bh); + if (!bh) { + EXT4_ERROR_INODE(inode, "bad symlink."); + return ERR_PTR(-EFSCORRUPTED); + } + } + + set_delayed_call(callback, ext4_free_link, bh); + nd_terminate_link(bh->b_data, inode->i_size, + inode->i_sb->s_blocksize - 1); + return bh->b_data; +} + +const struct inode_operations ext4_encrypted_symlink_inode_operations = { + .get_link = ext4_encrypted_get_link, + .setattr = ext4_setattr, + .getattr = ext4_encrypted_symlink_getattr, + .listxattr = ext4_listxattr, +}; + +const struct inode_operations ext4_symlink_inode_operations = { + .get_link = ext4_get_link, + .setattr = ext4_setattr, + .getattr = ext4_getattr, + .listxattr = ext4_listxattr, +}; + +const struct inode_operations ext4_fast_symlink_inode_operations = { + .get_link = simple_get_link, + .setattr = ext4_setattr, + .getattr = ext4_getattr, + .listxattr = ext4_listxattr, +}; diff --git a/fs/ext4l/sysfs.c b/fs/ext4l/sysfs.c new file mode 100644 index 00000000000..987bd00f916 --- /dev/null +++ b/fs/ext4l/sysfs.c @@ -0,0 +1,648 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/sysfs.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Theodore Ts'o (tytso@mit.edu) + * + */ + +#include <linux/time.h> +#include <linux/fs.h> 
+#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/proc_fs.h> +#include <linux/part_stat.h> + +#include "ext4.h" +#include "ext4_jbd2.h" + +typedef enum { + attr_noop, + attr_delayed_allocation_blocks, + attr_session_write_kbytes, + attr_lifetime_write_kbytes, + attr_reserved_clusters, + attr_sra_exceeded_retry_limit, + attr_inode_readahead, + attr_trigger_test_error, + attr_first_error_time, + attr_last_error_time, + attr_clusters_in_group, + attr_mb_order, + attr_feature, + attr_pointer_pi, + attr_pointer_ui, + attr_pointer_ul, + attr_pointer_u64, + attr_pointer_u8, + attr_pointer_string, + attr_pointer_atomic, + attr_journal_task, +} attr_id_t; + +typedef enum { + ptr_explicit, + ptr_ext4_sb_info_offset, + ptr_ext4_super_block_offset, +} attr_ptr_t; + +static const char proc_dirname[] = "fs/ext4"; +static struct proc_dir_entry *ext4_proc_root; + +struct ext4_attr { + struct attribute attr; + short attr_id; + short attr_ptr; + unsigned short attr_size; + union { + int offset; + void *explicit_ptr; + } u; +}; + +static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + return sysfs_emit(buf, "%lu\n", + (part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - + sbi->s_sectors_written_start) >> 1); +} + +static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + return sysfs_emit(buf, "%llu\n", + (unsigned long long)(sbi->s_kbytes_written + + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1))); +} + +static ssize_t inode_readahead_blks_store(struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long t; + int ret; + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + + if (t && (!is_power_of_2(t) || t > 0x40000000)) + return -EINVAL; + + sbi->s_inode_readahead_blks = t; + return count; +} + +static ssize_t reserved_clusters_store(struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long long val; + ext4_fsblk_t clusters = (ext4_blocks_count(sbi->s_es) >> + sbi->s_cluster_bits); + int ret; + + ret = kstrtoull(skip_spaces(buf), 0, &val); + if (ret || val >= clusters || (s64)val < 0) + return -EINVAL; + + atomic64_set(&sbi->s_resv_clusters, val); + return count; +} + +static ssize_t trigger_test_error(struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + int len = count; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (len && buf[len-1] == '\n') + len--; + + if (len) + ext4_error(sbi->s_sb, "%.*s", len, buf); + return count; +} + +static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf) +{ + if (!sbi->s_journal) + return sysfs_emit(buf, "<none>\n"); + return sysfs_emit(buf, "%d\n", + task_pid_vnr(sbi->s_journal->j_task)); +} + +#define EXT4_ATTR(_name,_mode,_id) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_##_id, \ +} + +#define EXT4_ATTR_FUNC(_name,_mode) EXT4_ATTR(_name,_mode,_name) + +#define EXT4_ATTR_FEATURE(_name) EXT4_ATTR(_name, 0444, feature) + +#define EXT4_ATTR_OFFSET(_name,_mode,_id,_struct,_elname) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_##_id, \ + .attr_ptr = ptr_##_struct##_offset, \ + .u = { \ + .offset = offsetof(struct _struct, _elname),\ + }, \ +} + +#define 
EXT4_ATTR_STRING(_name,_mode,_size,_struct,_elname) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_pointer_string, \ + .attr_size = _size, \ + .attr_ptr = ptr_##_struct##_offset, \ + .u = { \ + .offset = offsetof(struct _struct, _elname),\ + }, \ +} + +#define EXT4_RO_ATTR_ES_UI(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0444, pointer_ui, ext4_super_block, _elname) + +#define EXT4_RO_ATTR_ES_U8(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0444, pointer_u8, ext4_super_block, _elname) + +#define EXT4_RO_ATTR_ES_U64(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0444, pointer_u64, ext4_super_block, _elname) + +#define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size) \ + EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname) + +#define EXT4_RW_ATTR_SBI_PI(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0644, pointer_pi, ext4_sb_info, _elname) + +#define EXT4_RW_ATTR_SBI_UI(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname) + +#define EXT4_RW_ATTR_SBI_UL(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0644, pointer_ul, ext4_sb_info, _elname) + +#define EXT4_RO_ATTR_SBI_ATOMIC(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0444, pointer_atomic, ext4_sb_info, _elname) + +#define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_##_id, \ + .attr_ptr = ptr_explicit, \ + .u = { \ + .explicit_ptr = _ptr, \ + }, \ +} + +#define ATTR_LIST(name) &ext4_attr_##name.attr + +EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444); +EXT4_ATTR_FUNC(session_write_kbytes, 0444); +EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444); +EXT4_ATTR_FUNC(reserved_clusters, 0644); +EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444); + +EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, + ext4_sb_info, s_inode_readahead_blks); +EXT4_ATTR_OFFSET(mb_group_prealloc, 0644, clusters_in_group, + ext4_sb_info, s_mb_group_prealloc); +EXT4_ATTR_OFFSET(mb_best_avail_max_trim_order, 0644, mb_order, + ext4_sb_info, s_mb_best_avail_max_trim_order); +EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); +EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); +EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); +EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); +EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); +EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); +EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); +EXT4_RW_ATTR_SBI_PI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_PI(err_ratelimit_burst, s_err_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_PI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_PI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_PI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_PI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); +#ifdef CONFIG_EXT4_DEBUG +EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); +#endif +EXT4_RO_ATTR_SBI_ATOMIC(warning_count, s_warning_count); +EXT4_RO_ATTR_SBI_ATOMIC(msg_count, s_msg_count); +EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); +EXT4_RO_ATTR_ES_U8(first_error_errcode, s_first_error_errcode); +EXT4_RO_ATTR_ES_U8(last_error_errcode, s_last_error_errcode); +EXT4_RO_ATTR_ES_UI(first_error_ino, 
s_first_error_ino); +EXT4_RO_ATTR_ES_UI(last_error_ino, s_last_error_ino); +EXT4_RO_ATTR_ES_U64(first_error_block, s_first_error_block); +EXT4_RO_ATTR_ES_U64(last_error_block, s_last_error_block); +EXT4_RO_ATTR_ES_UI(first_error_line, s_first_error_line); +EXT4_RO_ATTR_ES_UI(last_error_line, s_last_error_line); +EXT4_RO_ATTR_ES_STRING(first_error_func, s_first_error_func, 32); +EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32); +EXT4_ATTR(first_error_time, 0444, first_error_time); +EXT4_ATTR(last_error_time, 0444, last_error_time); +EXT4_ATTR(journal_task, 0444, journal_task); +EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch); +EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit); +EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks); +EXT4_RW_ATTR_SBI_UI(sb_update_sec, s_sb_update_sec); +EXT4_RW_ATTR_SBI_UI(sb_update_kb, s_sb_update_kb); + +static unsigned int old_bump_val = 128; +EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); + +static struct attribute *ext4_attrs[] = { + ATTR_LIST(delayed_allocation_blocks), + ATTR_LIST(session_write_kbytes), + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(reserved_clusters), + ATTR_LIST(sra_exceeded_retry_limit), + ATTR_LIST(inode_readahead_blks), + ATTR_LIST(inode_goal), + ATTR_LIST(mb_stats), + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), + ATTR_LIST(mb_order2_req), + ATTR_LIST(mb_stream_req), + ATTR_LIST(mb_group_prealloc), + ATTR_LIST(mb_max_linear_groups), + ATTR_LIST(max_writeback_mb_bump), + ATTR_LIST(extent_max_zeroout_kb), + ATTR_LIST(trigger_fs_error), + ATTR_LIST(err_ratelimit_interval_ms), + ATTR_LIST(err_ratelimit_burst), + ATTR_LIST(warning_ratelimit_interval_ms), + ATTR_LIST(warning_ratelimit_burst), + ATTR_LIST(msg_ratelimit_interval_ms), + ATTR_LIST(msg_ratelimit_burst), + ATTR_LIST(mb_best_avail_max_trim_order), + ATTR_LIST(errors_count), + ATTR_LIST(warning_count), + ATTR_LIST(msg_count), + ATTR_LIST(first_error_ino), + ATTR_LIST(last_error_ino), + ATTR_LIST(first_error_block), + ATTR_LIST(last_error_block), + ATTR_LIST(first_error_line), + ATTR_LIST(last_error_line), + ATTR_LIST(first_error_func), + ATTR_LIST(last_error_func), + ATTR_LIST(first_error_errcode), + ATTR_LIST(last_error_errcode), + ATTR_LIST(first_error_time), + ATTR_LIST(last_error_time), + ATTR_LIST(journal_task), +#ifdef CONFIG_EXT4_DEBUG + ATTR_LIST(simulate_fail), +#endif + ATTR_LIST(mb_prefetch), + ATTR_LIST(mb_prefetch_limit), + ATTR_LIST(last_trim_minblks), + ATTR_LIST(sb_update_sec), + ATTR_LIST(sb_update_kb), + NULL, +}; +ATTRIBUTE_GROUPS(ext4); + +/* Features this copy of ext4 supports */ +EXT4_ATTR_FEATURE(lazy_itable_init); +EXT4_ATTR_FEATURE(batched_discard); +EXT4_ATTR_FEATURE(meta_bg_resize); +#ifdef CONFIG_FS_ENCRYPTION +EXT4_ATTR_FEATURE(encryption); +EXT4_ATTR_FEATURE(test_dummy_encryption_v2); +#endif +#if IS_ENABLED(CONFIG_UNICODE) +EXT4_ATTR_FEATURE(casefold); +#endif +#ifdef CONFIG_FS_VERITY +EXT4_ATTR_FEATURE(verity); +#endif +EXT4_ATTR_FEATURE(metadata_csum_seed); +EXT4_ATTR_FEATURE(fast_commit); +#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) +EXT4_ATTR_FEATURE(encrypted_casefold); +#endif + +static struct attribute *ext4_feat_attrs[] = { + ATTR_LIST(lazy_itable_init), + ATTR_LIST(batched_discard), + ATTR_LIST(meta_bg_resize), +#ifdef CONFIG_FS_ENCRYPTION + ATTR_LIST(encryption), + ATTR_LIST(test_dummy_encryption_v2), +#endif +#if IS_ENABLED(CONFIG_UNICODE) + ATTR_LIST(casefold), +#endif +#ifdef CONFIG_FS_VERITY + ATTR_LIST(verity), +#endif + 
ATTR_LIST(metadata_csum_seed), + ATTR_LIST(fast_commit), +#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) + ATTR_LIST(encrypted_casefold), +#endif + NULL, +}; +ATTRIBUTE_GROUPS(ext4_feat); + +static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi) +{ + switch (a->attr_ptr) { + case ptr_explicit: + return a->u.explicit_ptr; + case ptr_ext4_sb_info_offset: + return (void *) (((char *) sbi) + a->u.offset); + case ptr_ext4_super_block_offset: + return (void *) (((char *) sbi->s_es) + a->u.offset); + } + return NULL; +} + +static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi) +{ + return sysfs_emit(buf, "%lld\n", + ((time64_t)hi << 32) + le32_to_cpu(lo)); +} + +#define print_tstamp(buf, es, tstamp) \ + __print_tstamp(buf, (es)->tstamp, (es)->tstamp ## _hi) + +static ssize_t ext4_generic_attr_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + void *ptr = calc_ptr(a, sbi); + + if (!ptr) + return 0; + + switch (a->attr_id) { + case attr_inode_readahead: + case attr_clusters_in_group: + case attr_mb_order: + case attr_pointer_pi: + case attr_pointer_ui: + if (a->attr_ptr == ptr_ext4_super_block_offset) + return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr)); + return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr)); + case attr_pointer_ul: + return sysfs_emit(buf, "%lu\n", *((unsigned long *) ptr)); + case attr_pointer_u8: + return sysfs_emit(buf, "%u\n", *((unsigned char *) ptr)); + case attr_pointer_u64: + if (a->attr_ptr == ptr_ext4_super_block_offset) + return sysfs_emit(buf, "%llu\n", le64_to_cpup(ptr)); + return sysfs_emit(buf, "%llu\n", *((unsigned long long *) ptr)); + case attr_pointer_string: + return sysfs_emit(buf, "%.*s\n", a->attr_size, (char *) ptr); + case attr_pointer_atomic: + return sysfs_emit(buf, "%d\n", atomic_read((atomic_t *) ptr)); + } + return 0; +} + +static ssize_t ext4_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + + switch (a->attr_id) { + case attr_delayed_allocation_blocks: + return sysfs_emit(buf, "%llu\n", + (s64) EXT4_C2B(sbi, + percpu_counter_sum(&sbi->s_dirtyclusters_counter))); + case attr_session_write_kbytes: + return session_write_kbytes_show(sbi, buf); + case attr_lifetime_write_kbytes: + return lifetime_write_kbytes_show(sbi, buf); + case attr_reserved_clusters: + return sysfs_emit(buf, "%llu\n", + (unsigned long long) + atomic64_read(&sbi->s_resv_clusters)); + case attr_sra_exceeded_retry_limit: + return sysfs_emit(buf, "%llu\n", + (unsigned long long) + percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit)); + case attr_feature: + return sysfs_emit(buf, "supported\n"); + case attr_first_error_time: + return print_tstamp(buf, sbi->s_es, s_first_error_time); + case attr_last_error_time: + return print_tstamp(buf, sbi->s_es, s_last_error_time); + case attr_journal_task: + return journal_task_show(sbi, buf); + default: + return ext4_generic_attr_show(a, sbi, buf); + } +} + +static ssize_t ext4_generic_attr_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t len) +{ + int ret; + unsigned int t; + unsigned long lt; + void *ptr = calc_ptr(a, sbi); + + if (!ptr) + return 0; + + switch (a->attr_id) { + case attr_pointer_pi: + ret = kstrtouint(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if ((int)t < 0) + return -EINVAL; + *((unsigned int *) ptr) = t; + return len; + case attr_pointer_ui: + ret = 
kstrtouint(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if (a->attr_ptr == ptr_ext4_super_block_offset) + *((__le32 *) ptr) = cpu_to_le32(t); + else + *((unsigned int *) ptr) = t; + return len; + case attr_mb_order: + ret = kstrtouint(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if (t > 64) + return -EINVAL; + *((unsigned int *) ptr) = t; + return len; + case attr_clusters_in_group: + ret = kstrtouint(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if (t > sbi->s_clusters_per_group) + return -EINVAL; + *((unsigned int *) ptr) = t; + return len; + case attr_pointer_ul: + ret = kstrtoul(skip_spaces(buf), 0, &lt); + if (ret) + return ret; + *((unsigned long *) ptr) = lt; + return len; + } + return 0; +} + +static ssize_t ext4_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + + switch (a->attr_id) { + case attr_reserved_clusters: + return reserved_clusters_store(sbi, buf, len); + case attr_inode_readahead: + return inode_readahead_blks_store(sbi, buf, len); + case attr_trigger_test_error: + return trigger_test_error(sbi, buf, len); + default: + return ext4_generic_attr_store(a, sbi, buf, len); + } +} + +static void ext4_sb_release(struct kobject *kobj) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +static void ext4_feat_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static const struct sysfs_ops ext4_attr_ops = { + .show = ext4_attr_show, + .store = ext4_attr_store, +}; + +static const struct kobj_type ext4_sb_ktype = { + .default_groups = ext4_groups, + .sysfs_ops = &ext4_attr_ops, + .release = ext4_sb_release, +}; + +static const struct kobj_type ext4_feat_ktype = { + .default_groups = ext4_feat_groups, + .sysfs_ops = &ext4_attr_ops, + .release = ext4_feat_release, +}; + +void ext4_notify_error_sysfs(struct ext4_sb_info *sbi) +{ + sysfs_notify(&sbi->s_kobj, NULL, "errors_count"); +} + +static struct kobject *ext4_root; + +static struct kobject *ext4_feat; + +int ext4_register_sysfs(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err; + + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, ext4_root, + "%s", sb->s_id); + if (err) { + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + return err; + } + + if (ext4_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); + if (sbi->s_proc) { + proc_create_single_data("options", S_IRUGO, sbi->s_proc, + ext4_seq_options_show, sb); + proc_create_single_data("es_shrinker_info", S_IRUGO, + sbi->s_proc, ext4_seq_es_shrinker_info_show, + sb); + proc_create_single_data("fc_info", 0444, sbi->s_proc, + ext4_fc_info_show, sb); + proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_ops, sb); + proc_create_single_data("mb_stats", 0444, sbi->s_proc, + ext4_seq_mb_stats_show, sb); + proc_create_seq_data("mb_structs_summary", 0444, sbi->s_proc, + &ext4_mb_seq_structs_summary_ops, sb); + } + return 0; +} + +void ext4_unregister_sysfs(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sbi->s_proc) + remove_proc_subtree(sb->s_id, ext4_proc_root); + kobject_del(&sbi->s_kobj); +} + +int __init ext4_init_sysfs(void) +{ + int ret; + + ext4_root = kobject_create_and_add("ext4", fs_kobj); + if (!ext4_root) + return
-ENOMEM; + + ext4_feat = kzalloc(sizeof(*ext4_feat), GFP_KERNEL); + if (!ext4_feat) { + ret = -ENOMEM; + goto root_err; + } + + ret = kobject_init_and_add(ext4_feat, &ext4_feat_ktype, + ext4_root, "features"); + if (ret) + goto feat_err; + + ext4_proc_root = proc_mkdir(proc_dirname, NULL); + return ret; + +feat_err: + kobject_put(ext4_feat); + ext4_feat = NULL; +root_err: + kobject_put(ext4_root); + ext4_root = NULL; + return ret; +} + +void ext4_exit_sysfs(void) +{ + kobject_put(ext4_feat); + ext4_feat = NULL; + kobject_put(ext4_root); + ext4_root = NULL; + remove_proc_entry(proc_dirname, NULL); + ext4_proc_root = NULL; +} + diff --git a/fs/ext4l/truncate.h b/fs/ext4l/truncate.h new file mode 100644 index 00000000000..ce84aa2786c --- /dev/null +++ b/fs/ext4l/truncate.h @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/truncate.h + * + * Common inline functions needed for truncate support + */ + +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static inline void ext4_truncate_failed_write(struct inode *inode) +{ + struct address_space *mapping = inode->i_mapping; + + /* + * We don't need to call ext4_break_layouts() because the blocks we + * are truncating were never visible to userspace. + */ + filemap_invalidate_lock(mapping); + truncate_inode_pages(mapping, inode->i_size); + ext4_truncate(inode); + filemap_invalidate_unlock(mapping); +} + +/* + * Work out how many blocks we need to proceed with the next chunk of a + * truncate transaction. + */ +static inline unsigned long ext4_blocks_for_truncate(struct inode *inode) +{ + ext4_lblk_t needed; + + needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); + + /* Give ourselves just enough room to cope with inodes in which + * i_blocks is corrupt: we've seen disk corruptions in the past + * which resulted in random data in an inode which looked enough + * like a regular file for ext4 to try to delete it. Things + * will go a bit crazy if that happens, but at least we should + * try not to panic the whole kernel. */ + if (needed < 2) + needed = 2; + + /* But we need to bound the transaction so we don't overflow the + * journal. */ + if (needed > EXT4_MAX_TRANS_DATA) + needed = EXT4_MAX_TRANS_DATA; + + return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; +} + diff --git a/fs/ext4l/verity.c b/fs/ext4l/verity.c new file mode 100644 index 00000000000..b0acb0c5031 --- /dev/null +++ b/fs/ext4l/verity.c @@ -0,0 +1,399 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/ext4/verity.c: fs-verity support for ext4 + * + * Copyright 2019 Google LLC + */ + +/* + * Implementation of fsverity_operations for ext4. + * + * ext4 stores the verity metadata (Merkle tree and fsverity_descriptor) past + * the end of the file, starting at the first 64K boundary beyond i_size. This + * approach works because (a) verity files are readonly, and (b) pages fully + * beyond i_size aren't visible to userspace but can be read/written internally + * by ext4 with only some relatively small changes to ext4. This approach + * avoids having to depend on the EA_INODE feature and on rearchitecturing + * ext4's xattr support to support paging multi-gigabyte xattrs into memory, and + * to support encrypting xattrs. Note that the verity metadata *must* be + * encrypted when the file is, since it contains hashes of the plaintext data. 
+ * + * Using a 64K boundary rather than a 4K one keeps things ready for + * architectures with 64K pages, and it doesn't necessarily waste space on-disk + * since there can be a hole between i_size and the start of the Merkle tree. + */ + +#include <linux/quotaops.h> + +#include "ext4.h" +#include "ext4_extents.h" +#include "ext4_jbd2.h" + +static inline loff_t ext4_verity_metadata_pos(const struct inode *inode) +{ + return round_up(inode->i_size, 65536); +} + +/* + * Read some verity metadata from the inode. __vfs_read() can't be used because + * we need to read beyond i_size. + */ +static int pagecache_read(struct inode *inode, void *buf, size_t count, + loff_t pos) +{ + while (count) { + struct folio *folio; + size_t n; + + folio = read_mapping_folio(inode->i_mapping, pos >> PAGE_SHIFT, + NULL); + if (IS_ERR(folio)) + return PTR_ERR(folio); + + n = memcpy_from_file_folio(buf, folio, pos, count); + folio_put(folio); + + buf += n; + pos += n; + count -= n; + } + return 0; +} + +/* + * Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY. + * kernel_write() can't be used because the file descriptor is readonly. + */ +static int pagecache_write(struct inode *inode, const void *buf, size_t count, + loff_t pos) +{ + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *aops = mapping->a_ops; + + if (pos + count > inode->i_sb->s_maxbytes) + return -EFBIG; + + while (count) { + size_t n = min_t(size_t, count, + PAGE_SIZE - offset_in_page(pos)); + struct folio *folio; + void *fsdata = NULL; + int res; + + res = aops->write_begin(NULL, mapping, pos, n, &folio, &fsdata); + if (res) + return res; + + memcpy_to_folio(folio, offset_in_folio(folio, pos), buf, n); + + res = aops->write_end(NULL, mapping, pos, n, n, folio, fsdata); + if (res < 0) + return res; + if (res != n) + return -EIO; + + buf += n; + pos += n; + count -= n; + } + return 0; +} + +static int ext4_begin_enable_verity(struct file *filp) +{ + struct inode *inode = file_inode(filp); + const int credits = 2; /* superblock and inode for ext4_orphan_add() */ + handle_t *handle; + int err; + + if (IS_DAX(inode) || ext4_test_inode_flag(inode, EXT4_INODE_DAX)) + return -EINVAL; + + if (ext4_verity_in_progress(inode)) + return -EBUSY; + + /* + * Since the file was opened readonly, we have to initialize the jbd + * inode and quotas here and not rely on ->open() doing it. This must + * be done before evicting the inline data. + */ + + err = ext4_inode_attach_jinode(inode); + if (err) + return err; + + err = dquot_initialize(inode); + if (err) + return err; + + err = ext4_convert_inline_data(inode); + if (err) + return err; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ext4_warning_inode(inode, + "verity is only allowed on extent-based files"); + return -EOPNOTSUPP; + } + + /* + * ext4 uses the last allocated block to find the verity descriptor, so + * we must remove any other blocks past EOF which might confuse things. + */ + err = ext4_truncate(inode); + if (err) + return err; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext4_orphan_add(handle, inode); + if (err == 0) + ext4_set_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); + + ext4_journal_stop(handle); + return err; +} + +/* + * ext4 stores the verity descriptor beginning on the next filesystem block + * boundary after the Merkle tree. 
Then, the descriptor size is stored in the + * last 4 bytes of the last allocated filesystem block --- which is either the + * block in which the descriptor ends, or the next block after that if there + * weren't at least 4 bytes remaining. + * + * We can't simply store the descriptor in an xattr because it *must* be + * encrypted when ext4 encryption is used, but ext4 encryption doesn't encrypt + * xattrs. Also, if the descriptor includes a large signature blob it may be + * too large to store in an xattr without the EA_INODE feature. + */ +static int ext4_write_verity_descriptor(struct inode *inode, const void *desc, + size_t desc_size, u64 merkle_tree_size) +{ + const u64 desc_pos = round_up(ext4_verity_metadata_pos(inode) + + merkle_tree_size, i_blocksize(inode)); + const u64 desc_end = desc_pos + desc_size; + const __le32 desc_size_disk = cpu_to_le32(desc_size); + const u64 desc_size_pos = round_up(desc_end + sizeof(desc_size_disk), + i_blocksize(inode)) - + sizeof(desc_size_disk); + int err; + + err = pagecache_write(inode, desc, desc_size, desc_pos); + if (err) + return err; + + return pagecache_write(inode, &desc_size_disk, sizeof(desc_size_disk), + desc_size_pos); +} + +static int ext4_end_enable_verity(struct file *filp, const void *desc, + size_t desc_size, u64 merkle_tree_size) +{ + struct inode *inode = file_inode(filp); + const int credits = 2; /* superblock and inode for ext4_orphan_del() */ + handle_t *handle; + struct ext4_iloc iloc; + int err = 0; + + /* + * If an error already occurred (which fs/verity/ signals by passing + * desc == NULL), then only clean-up is needed. + */ + if (desc == NULL) + goto cleanup; + + /* Append the verity descriptor. */ + err = ext4_write_verity_descriptor(inode, desc, desc_size, + merkle_tree_size); + if (err) + goto cleanup; + + /* + * Write all pages (both data and verity metadata). Note that this must + * happen before clearing EXT4_STATE_VERITY_IN_PROGRESS; otherwise pages + * beyond i_size won't be written properly. For crash consistency, this + * also must happen before the verity inode flag gets persisted. + */ + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto cleanup; + + /* + * Finally, set the verity inode flag and remove the inode from the + * orphan list (in a single transaction). + */ + + handle = ext4_journal_start(inode, EXT4_HT_INODE, credits); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto cleanup; + } + + err = ext4_orphan_del(handle, inode); + if (err) + goto stop_and_cleanup; + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto stop_and_cleanup; + + ext4_set_inode_flag(inode, EXT4_INODE_VERITY); + ext4_set_inode_flags(inode, false); + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (err) + goto stop_and_cleanup; + + ext4_journal_stop(handle); + + ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); + return 0; + +stop_and_cleanup: + ext4_journal_stop(handle); +cleanup: + /* + * Verity failed to be enabled, so clean up by truncating any verity + * metadata that was written beyond i_size (both from cache and from + * disk), removing the inode from the orphan list (if it wasn't done + * already), and clearing EXT4_STATE_VERITY_IN_PROGRESS. 
+ */ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); + ext4_orphan_del(NULL, inode); + ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); + return err; +} + +static int ext4_get_verity_descriptor_location(struct inode *inode, + size_t *desc_size_ret, + u64 *desc_pos_ret) +{ + struct ext4_ext_path *path; + struct ext4_extent *last_extent; + u32 end_lblk; + u64 desc_size_pos; + __le32 desc_size_disk; + u32 desc_size; + u64 desc_pos; + int err; + + /* + * Descriptor size is in last 4 bytes of last allocated block. + * See ext4_write_verity_descriptor(). + */ + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + EXT4_ERROR_INODE(inode, "verity file doesn't use extents"); + return -EFSCORRUPTED; + } + + path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + + last_extent = path[path->p_depth].p_ext; + if (!last_extent) { + EXT4_ERROR_INODE(inode, "verity file has no extents"); + ext4_free_ext_path(path); + return -EFSCORRUPTED; + } + + end_lblk = le32_to_cpu(last_extent->ee_block) + + ext4_ext_get_actual_len(last_extent); + desc_size_pos = (u64)end_lblk << inode->i_blkbits; + ext4_free_ext_path(path); + + if (desc_size_pos < sizeof(desc_size_disk)) + goto bad; + desc_size_pos -= sizeof(desc_size_disk); + + err = pagecache_read(inode, &desc_size_disk, sizeof(desc_size_disk), + desc_size_pos); + if (err) + return err; + desc_size = le32_to_cpu(desc_size_disk); + + /* + * The descriptor is stored just before the desc_size_disk, but starting + * on a filesystem block boundary. + */ + + if (desc_size > INT_MAX || desc_size > desc_size_pos) + goto bad; + + desc_pos = round_down(desc_size_pos - desc_size, i_blocksize(inode)); + if (desc_pos < ext4_verity_metadata_pos(inode)) + goto bad; + + *desc_size_ret = desc_size; + *desc_pos_ret = desc_pos; + return 0; + +bad: + EXT4_ERROR_INODE(inode, "verity file corrupted; can't find descriptor"); + return -EFSCORRUPTED; +} + +static int ext4_get_verity_descriptor(struct inode *inode, void *buf, + size_t buf_size) +{ + size_t desc_size = 0; + u64 desc_pos = 0; + int err; + + err = ext4_get_verity_descriptor_location(inode, &desc_size, &desc_pos); + if (err) + return err; + + if (buf_size) { + if (desc_size > buf_size) + return -ERANGE; + err = pagecache_read(inode, buf, desc_size, desc_pos); + if (err) + return err; + } + return desc_size; +} + +static struct page *ext4_read_merkle_tree_page(struct inode *inode, + pgoff_t index, + unsigned long num_ra_pages) +{ + struct folio *folio; + + index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; + + folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0); + if (IS_ERR(folio) || !folio_test_uptodate(folio)) { + DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); + + if (!IS_ERR(folio)) + folio_put(folio); + else if (num_ra_pages > 1) + page_cache_ra_unbounded(&ractl, num_ra_pages, 0); + folio = read_mapping_folio(inode->i_mapping, index, NULL); + if (IS_ERR(folio)) + return ERR_CAST(folio); + } + return folio_file_page(folio, index); +} + +static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf, + u64 pos, unsigned int size) +{ + pos += ext4_verity_metadata_pos(inode); + + return pagecache_write(inode, buf, size, pos); +} + +const struct fsverity_operations ext4_verityops = { + .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_verity_info) - + (int)offsetof(struct ext4_inode_info, vfs_inode), + .begin_enable_verity = ext4_begin_enable_verity, + 
.end_enable_verity = ext4_end_enable_verity, + .get_verity_descriptor = ext4_get_verity_descriptor, + .read_merkle_tree_page = ext4_read_merkle_tree_page, + .write_merkle_tree_block = ext4_write_merkle_tree_block, +}; -- 2.43.0
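To make the descriptor placement in verity.c above concrete: the verity metadata starts at the first 64K boundary beyond i_size, the descriptor begins on the next filesystem block boundary after the Merkle tree, and the descriptor size occupies the last 4 bytes of the last allocated block. Below is a minimal standalone sketch of that arithmetic, not part of the patch; the 4096-byte block size and the sizes in main() are assumed example values, and round_up64() simply mirrors the kernel's round_up() macro for power-of-two alignment.

#include <stdint.h>
#include <stdio.h>

/* Illustrative restatement of the placement rules in fs/ext4l/verity.c. */
static uint64_t round_up64(uint64_t x, uint64_t a)
{
	return (x + a - 1) & ~(a - 1);	/* a must be a power of two */
}

int main(void)
{
	const uint64_t blocksize = 4096;	/* assumed filesystem block size */
	const uint64_t i_size = 1000000;	/* example file size */
	const uint64_t tree_size = 12288;	/* example Merkle tree size */
	const uint64_t desc_size = 256;		/* example descriptor size */

	/* Metadata starts at the first 64K boundary beyond i_size. */
	uint64_t meta_pos = round_up64(i_size, 65536);

	/* The descriptor starts on the next block boundary after the tree. */
	uint64_t desc_pos = round_up64(meta_pos + tree_size, blocksize);

	/* Its size is stored in the last 4 bytes of the last allocated
	 * block, i.e. the block in which the descriptor ends, or the next
	 * one if fewer than 4 bytes remained there.
	 */
	uint64_t size_pos = round_up64(desc_pos + desc_size + 4, blocksize) - 4;

	printf("tree@%llu desc@%llu size-word@%llu\n",
	       (unsigned long long)meta_pos,
	       (unsigned long long)desc_pos,
	       (unsigned long long)size_pos);
	return 0;
}

With these example numbers, the tree lands at 1048576, the descriptor at 1060864, and the size word at 1064956, matching what ext4_write_verity_descriptor() and ext4_get_verity_descriptor_location() compute from the last extent.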
From: Simon Glass <simon.glass@canonical.com> Copy xattr.c, xattr.h, xattr_hurd.c, xattr_security.c, xattr_trusted.c, and xattr_user.c from the Linux v6.18 fs/ext4 directory. These files implement extended attribute support for ext4: - xattr.c/h: core xattr implementation - xattr_hurd: Hurd translator xattr handler - xattr_security: security label xattr handler - xattr_trusted: trusted xattr handler - xattr_user: user xattr handler Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> --- fs/ext4l/xattr.c | 3225 +++++++++++++++++++++++++++++++++++++ fs/ext4l/xattr.h | 236 +++ fs/ext4l/xattr_hurd.c | 52 + fs/ext4l/xattr_security.c | 66 + fs/ext4l/xattr_trusted.c | 47 + fs/ext4l/xattr_user.c | 50 + 6 files changed, 3676 insertions(+) create mode 100644 fs/ext4l/xattr.c create mode 100644 fs/ext4l/xattr.h create mode 100644 fs/ext4l/xattr_hurd.c create mode 100644 fs/ext4l/xattr_security.c create mode 100644 fs/ext4l/xattr_trusted.c create mode 100644 fs/ext4l/xattr_user.c diff --git a/fs/ext4l/xattr.c b/fs/ext4l/xattr.c new file mode 100644 index 00000000000..ce7253b3f54 --- /dev/null +++ b/fs/ext4l/xattr.c @@ -0,0 +1,3225 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/xattr.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> + * + * Fix by Harrison Xing <harrison@mountainviewdata.com>. + * Ext4 code with a lot of help from Eric Jarman <ejarman@acm.org>. + * Extended attributes for symlinks and special files added per + * suggestion of Luka Renko <luka.renko@hermes.si>. + * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>, + * Red Hat Inc. + * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz + * and Andreas Gruenbacher <agruen@suse.de>. + */ + +/* + * Extended attributes are stored directly in inodes (on file systems with + * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl + * field contains the block number if an inode uses an additional block. All + * attributes must fit in the inode and one additional block. Blocks that + * contain the identical set of attributes may be shared among several inodes. + * Identical blocks are detected by keeping a cache of blocks that have + * recently been accessed. + * + * The attributes in inodes and on blocks have a different header; the entries + * are stored in the same format: + * + * +------------------+ + * | header | + * | entry 1 | | + * | entry 2 | | growing downwards + * | entry 3 | v + * | four null bytes | + * | . . . | + * | value 1 | ^ + * | value 3 | | growing upwards + * | value 2 | | + * +------------------+ + * + * The header is followed by multiple entry descriptors. In disk blocks, the + * entry descriptors are kept sorted. In inodes, they are unsorted. The + * attribute values are aligned to the end of the block in no specific order. + * + * Locking strategy + * ---------------- + * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem. + * EA blocks are only changed if they are exclusive to an inode, so + * holding xattr_sem also means that nothing but the EA block's reference + * count can change. Multiple writers to the same block are synchronized + * by the buffer lock. + */ + +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/mbcache.h> +#include <linux/quotaops.h> +#include <linux/iversion.h> +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" +#include "acl.h" + +#ifdef EXT4_XATTR_DEBUG +# define ea_idebug(inode, fmt, ...)
\ + printk(KERN_DEBUG "inode %s:%lu: " fmt "\n", \ + inode->i_sb->s_id, inode->i_ino, ##__VA_ARGS__) +# define ea_bdebug(bh, fmt, ...) \ + printk(KERN_DEBUG "block %pg:%lu: " fmt "\n", \ + bh->b_bdev, (unsigned long)bh->b_blocknr, ##__VA_ARGS__) +#else +# define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +static void ext4_xattr_block_cache_insert(struct mb_cache *, + struct buffer_head *); +static struct buffer_head * +ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *, + struct mb_cache_entry **); +static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, + size_t value_count); +static __le32 ext4_xattr_hash_entry_signed(char *name, size_t name_len, __le32 *value, + size_t value_count); +static void ext4_xattr_rehash(struct ext4_xattr_header *); + +static const struct xattr_handler * const ext4_xattr_handler_map[] = { + [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, +#ifdef CONFIG_EXT4_FS_POSIX_ACL + [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, + [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, +#endif + [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, +#ifdef CONFIG_EXT4_FS_SECURITY + [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, +#endif + [EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler, +}; + +const struct xattr_handler * const ext4_xattr_handlers[] = { + &ext4_xattr_user_handler, + &ext4_xattr_trusted_handler, +#ifdef CONFIG_EXT4_FS_SECURITY + &ext4_xattr_security_handler, +#endif + &ext4_xattr_hurd_handler, + NULL +}; + +#define EA_BLOCK_CACHE(inode) (((struct ext4_sb_info *) \ + inode->i_sb->s_fs_info)->s_ea_block_cache) + +#define EA_INODE_CACHE(inode) (((struct ext4_sb_info *) \ + inode->i_sb->s_fs_info)->s_ea_inode_cache) + +static int +ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, + struct inode *inode); + +#ifdef CONFIG_LOCKDEP +void ext4_xattr_inode_set_class(struct inode *ea_inode) +{ + struct ext4_inode_info *ei = EXT4_I(ea_inode); + + lockdep_set_subclass(&ea_inode->i_rwsem, 1); + (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */ + lockdep_set_subclass(&ei->i_data_sem, I_DATA_SEM_EA); +} +#endif + +static __le32 ext4_xattr_block_csum(struct inode *inode, + sector_t block_nr, + struct ext4_xattr_header *hdr) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u32 csum; + __le64 dsk_block_nr = cpu_to_le64(block_nr); + __u32 dummy_csum = 0; + int offset = offsetof(struct ext4_xattr_header, h_checksum); + + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); + csum = ext4_chksum(csum, (__u8 *)hdr, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); + offset += sizeof(dummy_csum); + csum = ext4_chksum(csum, (__u8 *)hdr + offset, + EXT4_BLOCK_SIZE(inode->i_sb) - offset); + + return cpu_to_le32(csum); +} + +static int ext4_xattr_block_csum_verify(struct inode *inode, + struct buffer_head *bh) +{ + struct ext4_xattr_header *hdr = BHDR(bh); + int ret = 1; + + if (ext4_has_feature_metadata_csum(inode->i_sb)) { + lock_buffer(bh); + ret = (hdr->h_checksum == ext4_xattr_block_csum(inode, + bh->b_blocknr, hdr)); + unlock_buffer(bh); + } + return ret; +} + +static void ext4_xattr_block_csum_set(struct inode *inode, + struct buffer_head *bh) +{ + if (ext4_has_feature_metadata_csum(inode->i_sb)) + BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode, + bh->b_blocknr, BHDR(bh)); +} + +static inline const char 
*ext4_xattr_prefix(int name_index, + struct dentry *dentry) +{ + const struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) + handler = ext4_xattr_handler_map[name_index]; + + if (!xattr_handler_can_list(handler, dentry)) + return NULL; + + return xattr_prefix(handler); +} + +static int +check_xattrs(struct inode *inode, struct buffer_head *bh, + struct ext4_xattr_entry *entry, void *end, void *value_start, + const char *function, unsigned int line) +{ + struct ext4_xattr_entry *e = entry; + int err = -EFSCORRUPTED; + char *err_str; + + if (bh) { + if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) { + err_str = "invalid header"; + goto errout; + } + if (buffer_verified(bh)) + return 0; + if (!ext4_xattr_block_csum_verify(inode, bh)) { + err = -EFSBADCRC; + err_str = "invalid checksum"; + goto errout; + } + } else { + struct ext4_xattr_ibody_header *header = value_start; + + header -= 1; + if (end - (void *)header < sizeof(*header) + sizeof(u32)) { + err_str = "in-inode xattr block too small"; + goto errout; + } + if (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { + err_str = "bad magic number in in-inode xattr"; + goto errout; + } + } + + /* Find the end of the names list */ + while (!IS_LAST_ENTRY(e)) { + struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); + if ((void *)next >= end) { + err_str = "e_name out of bounds"; + goto errout; + } + if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) { + err_str = "bad e_name length"; + goto errout; + } + e = next; + } + + /* Check the values */ + while (!IS_LAST_ENTRY(entry)) { + u32 size = le32_to_cpu(entry->e_value_size); + unsigned long ea_ino = le32_to_cpu(entry->e_value_inum); + + if (!ext4_has_feature_ea_inode(inode->i_sb) && ea_ino) { + err_str = "ea_inode specified without ea_inode feature enabled"; + goto errout; + } + if (ea_ino && ((ea_ino == EXT4_ROOT_INO) || + !ext4_valid_inum(inode->i_sb, ea_ino))) { + err_str = "invalid ea_ino"; + goto errout; + } + if (ea_ino && !size) { + err_str = "invalid size in ea xattr"; + goto errout; + } + if (size > EXT4_XATTR_SIZE_MAX) { + err_str = "e_value size too large"; + goto errout; + } + + if (size != 0 && entry->e_value_inum == 0) { + u16 offs = le16_to_cpu(entry->e_value_offs); + void *value; + + /* + * The value cannot overlap the names, and the value + * with padding cannot extend beyond 'end'. Check both + * the padded and unpadded sizes, since the size may + * overflow to 0 when adding padding. 
+ */ + if (offs > end - value_start) { + err_str = "e_value out of bounds"; + goto errout; + } + value = value_start + offs; + if (value < (void *)e + sizeof(u32) || + size > end - value || + EXT4_XATTR_SIZE(size) > end - value) { + err_str = "overlapping e_value "; + goto errout; + } + } + entry = EXT4_XATTR_NEXT(entry); + } + if (bh) + set_buffer_verified(bh); + return 0; + +errout: + if (bh) + __ext4_error_inode(inode, function, line, 0, -err, + "corrupted xattr block %llu: %s", + (unsigned long long) bh->b_blocknr, + err_str); + else + __ext4_error_inode(inode, function, line, 0, -err, + "corrupted in-inode xattr: %s", err_str); + return err; +} + +static inline int +__ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, + const char *function, unsigned int line) +{ + return check_xattrs(inode, bh, BFIRST(bh), bh->b_data + bh->b_size, + bh->b_data, function, line); +} + +#define ext4_xattr_check_block(inode, bh) \ + __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) + + +int +__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, + void *end, const char *function, unsigned int line) +{ + return check_xattrs(inode, NULL, IFIRST(header), end, IFIRST(header), + function, line); +} + +static int +xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, + void *end, int name_index, const char *name, int sorted) +{ + struct ext4_xattr_entry *entry, *next; + size_t name_len; + int cmp = 1; + + if (name == NULL) + return -EINVAL; + name_len = strlen(name); + for (entry = *pentry; !IS_LAST_ENTRY(entry); entry = next) { + next = EXT4_XATTR_NEXT(entry); + if ((void *) next >= end) { + EXT4_ERROR_INODE(inode, "corrupted xattr entries"); + return -EFSCORRUPTED; + } + cmp = name_index - entry->e_name_index; + if (!cmp) + cmp = name_len - entry->e_name_len; + if (!cmp) + cmp = memcmp(name, entry->e_name, name_len); + if (!cmp || (cmp < 0 && sorted)) + break; + } + *pentry = entry; + return cmp ? -ENODATA : 0; +} + +static u32 +ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) +{ + return ext4_chksum(sbi->s_csum_seed, buffer, size); +} + +static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) +{ + return ((u64) inode_get_ctime_sec(ea_inode) << 32) | + (u32) inode_peek_iversion_raw(ea_inode); +} + +static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count) +{ + inode_set_ctime(ea_inode, (u32)(ref_count >> 32), 0); + inode_set_iversion_raw(ea_inode, ref_count & 0xffffffff); +} + +static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode) +{ + return (u32) inode_get_atime_sec(ea_inode); +} + +static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash) +{ + inode_set_atime(ea_inode, hash, 0); +} + +/* + * Read the EA value from an inode. + */ +static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size) +{ + int blocksize = 1 << ea_inode->i_blkbits; + int bh_count = (size + blocksize - 1) >> ea_inode->i_blkbits; + int tail_size = (size % blocksize) ?: blocksize; + struct buffer_head *bhs_inline[8]; + struct buffer_head **bhs = bhs_inline; + int i, ret; + + if (bh_count > ARRAY_SIZE(bhs_inline)) { + bhs = kmalloc_array(bh_count, sizeof(*bhs), GFP_NOFS); + if (!bhs) + return -ENOMEM; + } + + ret = ext4_bread_batch(ea_inode, 0 /* block */, bh_count, + true /* wait */, bhs); + if (ret) + goto free_bhs; + + for (i = 0; i < bh_count; i++) { + /* There shouldn't be any holes in ea_inode. 
*/ + if (!bhs[i]) { + ret = -EFSCORRUPTED; + goto put_bhs; + } + memcpy((char *)buf + blocksize * i, bhs[i]->b_data, + i < bh_count - 1 ? blocksize : tail_size); + } + ret = 0; +put_bhs: + for (i = 0; i < bh_count; i++) + brelse(bhs[i]); +free_bhs: + if (bhs != bhs_inline) + kfree(bhs); + return ret; +} + +#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode_get_mtime_sec(inode))) + +static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, + u32 ea_inode_hash, struct inode **ea_inode) +{ + struct inode *inode; + int err; + + /* + * We have to check for this corruption early as otherwise + * iget_locked() could wait indefinitely for the state of our + * parent inode. + */ + if (parent->i_ino == ea_ino) { + ext4_error(parent->i_sb, + "Parent and EA inode have the same ino %lu", ea_ino); + return -EFSCORRUPTED; + } + + inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_EA_INODE); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + ext4_error(parent->i_sb, + "error while reading EA inode %lu err=%d", ea_ino, + err); + return err; + } + ext4_xattr_inode_set_class(inode); + + /* + * Check whether this is an old Lustre-style xattr inode. Lustre + * implementation does not have hash validation, rather it has a + * backpointer from ea_inode to the parent inode. + */ + if (ea_inode_hash != ext4_xattr_inode_get_hash(inode) && + EXT4_XATTR_INODE_GET_PARENT(inode) == parent->i_ino && + inode->i_generation == parent->i_generation) { + ext4_set_inode_state(inode, EXT4_STATE_LUSTRE_EA_INODE); + ext4_xattr_inode_set_ref(inode, 1); + } else { + inode_lock_nested(inode, I_MUTEX_XATTR); + inode->i_flags |= S_NOQUOTA; + inode_unlock(inode); + } + + *ea_inode = inode; + return 0; +} + +/* Remove entry from mbcache when EA inode is getting evicted */ +void ext4_evict_ea_inode(struct inode *inode) +{ + struct mb_cache_entry *oe; + + if (!EA_INODE_CACHE(inode)) + return; + /* Wait for entry to get unused so that we can remove it */ + while ((oe = mb_cache_entry_delete_or_get(EA_INODE_CACHE(inode), + ext4_xattr_inode_get_hash(inode), inode->i_ino))) { + mb_cache_entry_wait_unused(oe); + mb_cache_entry_put(EA_INODE_CACHE(inode), oe); + } +} + +static int +ext4_xattr_inode_verify_hashes(struct inode *ea_inode, + struct ext4_xattr_entry *entry, void *buffer, + size_t size) +{ + u32 hash; + + /* Verify stored hash matches calculated hash. */ + hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buffer, size); + if (hash != ext4_xattr_inode_get_hash(ea_inode)) + return -EFSCORRUPTED; + + if (entry) { + __le32 e_hash, tmp_data; + + /* Verify entry hash. */ + tmp_data = cpu_to_le32(hash); + e_hash = ext4_xattr_hash_entry(entry->e_name, entry->e_name_len, + &tmp_data, 1); + /* All good? */ + if (e_hash == entry->e_hash) + return 0; + + /* + * Not good. Maybe the entry hash was calculated + * using the buggy signed char version? + */ + e_hash = ext4_xattr_hash_entry_signed(entry->e_name, entry->e_name_len, + &tmp_data, 1); + /* Still no match - bad */ + if (e_hash != entry->e_hash) + return -EFSCORRUPTED; + + /* Let people know about old hash */ + pr_warn_once("ext4: filesystem with signed xattr name hash"); + } + return 0; +} + +/* + * Read xattr value from the EA inode. 
+ */ +static int +ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry, + void *buffer, size_t size) +{ + struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); + struct inode *ea_inode; + int err; + + err = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum), + le32_to_cpu(entry->e_hash), &ea_inode); + if (err) { + ea_inode = NULL; + goto out; + } + + if (i_size_read(ea_inode) != size) { + ext4_warning_inode(ea_inode, + "ea_inode file size=%llu entry size=%zu", + i_size_read(ea_inode), size); + err = -EFSCORRUPTED; + goto out; + } + + err = ext4_xattr_inode_read(ea_inode, buffer, size); + if (err) + goto out; + + if (!ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) { + err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, + size); + if (err) { + ext4_warning_inode(ea_inode, + "EA inode hash validation failed"); + goto out; + } + + if (ea_inode_cache) + mb_cache_entry_create(ea_inode_cache, GFP_NOFS, + ext4_xattr_inode_get_hash(ea_inode), + ea_inode->i_ino, true /* reusable */); + } +out: + iput(ea_inode); + return err; +} + +static int +ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct buffer_head *bh = NULL; + struct ext4_xattr_entry *entry; + size_t size; + void *end; + int error; + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); + + ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", + name_index, name, buffer, (long)buffer_size); + + if (!EXT4_I(inode)->i_file_acl) + return -ENODATA; + ea_idebug(inode, "reading block %llu", + (unsigned long long)EXT4_I(inode)->i_file_acl); + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) + return PTR_ERR(bh); + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + error = ext4_xattr_check_block(inode, bh); + if (error) + goto cleanup; + ext4_xattr_block_cache_insert(ea_block_cache, bh); + entry = BFIRST(bh); + end = bh->b_data + bh->b_size; + error = xattr_find_entry(inode, &entry, end, name_index, name, 1); + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + error = -ERANGE; + if (unlikely(size > EXT4_XATTR_SIZE_MAX)) + goto cleanup; + if (buffer) { + if (size > buffer_size) + goto cleanup; + if (entry->e_value_inum) { + error = ext4_xattr_inode_get(inode, entry, buffer, + size); + if (error) + goto cleanup; + } else { + u16 offset = le16_to_cpu(entry->e_value_offs); + void *p = bh->b_data + offset; + + if (unlikely(p + size > end)) + goto cleanup; + memcpy(buffer, p, size); + } + } + error = size; + +cleanup: + brelse(bh); + return error; +} + +int +ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry; + struct ext4_inode *raw_inode; + struct ext4_iloc iloc; + size_t size; + void *end; + int error; + + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) + return -ENODATA; + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + end = ITAIL(inode, raw_inode); + entry = IFIRST(header); + error = xattr_find_entry(inode, &entry, end, name_index, name, 0); + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + error = -ERANGE; + if (unlikely(size > EXT4_XATTR_SIZE_MAX)) + goto cleanup; + if (buffer) { + if (size > buffer_size) + goto cleanup; + if 
(entry->e_value_inum) { + error = ext4_xattr_inode_get(inode, entry, buffer, + size); + if (error) + goto cleanup; + } else { + u16 offset = le16_to_cpu(entry->e_value_offs); + void *p = (void *)IFIRST(header) + offset; + + if (unlikely(p + size > end)) + goto cleanup; + memcpy(buffer, p, size); + } + } + error = size; + +cleanup: + brelse(iloc.bh); + return error; +} + +/* + * ext4_xattr_get() + * + * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +int +ext4_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + int error; + + if (unlikely(ext4_forced_shutdown(inode->i_sb))) + return -EIO; + + if (strlen(name) > 255) + return -ERANGE; + + down_read(&EXT4_I(inode)->xattr_sem); + error = ext4_xattr_ibody_get(inode, name_index, name, buffer, + buffer_size); + if (error == -ENODATA) + error = ext4_xattr_block_get(inode, name_index, name, buffer, + buffer_size); + up_read(&EXT4_I(inode)->xattr_sem); + return error; +} + +static int +ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, + char *buffer, size_t buffer_size) +{ + size_t rest = buffer_size; + + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + const char *prefix; + + prefix = ext4_xattr_prefix(entry->e_name_index, dentry); + if (prefix) { + size_t prefix_len = strlen(prefix); + size_t size = prefix_len + entry->e_name_len + 1; + + if (buffer) { + if (size > rest) + return -ERANGE; + memcpy(buffer, prefix, prefix_len); + buffer += prefix_len; + memcpy(buffer, entry->e_name, entry->e_name_len); + buffer += entry->e_name_len; + *buffer++ = 0; + } + rest -= size; + } + } + return buffer_size - rest; /* total size */ +} + +static int +ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = d_inode(dentry); + struct buffer_head *bh = NULL; + int error; + + ea_idebug(inode, "buffer=%p, buffer_size=%ld", + buffer, (long)buffer_size); + + if (!EXT4_I(inode)->i_file_acl) + return 0; + ea_idebug(inode, "reading block %llu", + (unsigned long long)EXT4_I(inode)->i_file_acl); + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) + return PTR_ERR(bh); + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + error = ext4_xattr_check_block(inode, bh); + if (error) + goto cleanup; + ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh); + error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, + buffer_size); +cleanup: + brelse(bh); + return error; +} + +static int +ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = d_inode(dentry); + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + struct ext4_iloc iloc; + int error; + + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) + return 0; + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + error = ext4_xattr_list_entries(dentry, IFIRST(header), + buffer, buffer_size); + + brelse(iloc.bh); + return error; +} + +/* + * Inode operation listxattr() + * + * d_inode(dentry)->i_rwsem: don't care + * + * Copy a list of attribute names into the buffer + * provided, or compute the buffer 
size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +ssize_t +ext4_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + int ret, ret2; + + down_read(&EXT4_I(d_inode(dentry))->xattr_sem); + ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size); + if (ret < 0) + goto errout; + if (buffer) { + buffer += ret; + buffer_size -= ret; + } + ret = ext4_xattr_block_list(dentry, buffer, buffer_size); + if (ret < 0) + goto errout; + ret += ret2; +errout: + up_read(&EXT4_I(d_inode(dentry))->xattr_sem); + return ret; +} + +/* + * If the EXT4_FEATURE_COMPAT_EXT_ATTR feature of this file system is + * not set, set it. + */ +static void ext4_xattr_update_super_block(handle_t *handle, + struct super_block *sb) +{ + if (ext4_has_feature_xattr(sb)) + return; + + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); + if (ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, + EXT4_JTR_NONE) == 0) { + lock_buffer(EXT4_SB(sb)->s_sbh); + ext4_set_feature_xattr(sb); + ext4_superblock_csum_set(sb); + unlock_buffer(EXT4_SB(sb)->s_sbh); + ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + } +} + +int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) +{ + struct ext4_iloc iloc = { .bh = NULL }; + struct buffer_head *bh = NULL; + struct ext4_inode *raw_inode; + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry; + qsize_t ea_inode_refs = 0; + int ret; + + lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem); + + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + goto out; + raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + + for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) + if (entry->e_value_inum) + ea_inode_refs++; + } + + if (EXT4_I(inode)->i_file_acl) { + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); + bh = NULL; + goto out; + } + + ret = ext4_xattr_check_block(inode, bh); + if (ret) + goto out; + + for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) + if (entry->e_value_inum) + ea_inode_refs++; + } + *usage = ea_inode_refs + 1; + ret = 0; +out: + brelse(iloc.bh); + brelse(bh); + return ret; +} + +static inline size_t round_up_cluster(struct inode *inode, size_t length) +{ + struct super_block *sb = inode->i_sb; + size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits + + inode->i_blkbits); + size_t mask = ~(cluster_size - 1); + + return (length + cluster_size - 1) & mask; +} + +static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len) +{ + int err; + + err = dquot_alloc_inode(inode); + if (err) + return err; + err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len)); + if (err) + dquot_free_inode(inode); + return err; +} + +static void ext4_xattr_inode_free_quota(struct inode *parent, + struct inode *ea_inode, + size_t len) +{ + if (ea_inode && + ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) + return; + dquot_free_space_nodirty(parent, round_up_cluster(parent, len)); + dquot_free_inode(parent); +} + +int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, + struct buffer_head *block_bh, size_t value_len, + bool is_create) +{ + int credits; + int blocks; + + /* + * 1) Owner inode update + * 2) Ref count update on old xattr block + * 
3) new xattr block + * 4) block bitmap update for new xattr block + * 5) group descriptor for new xattr block + * 6) block bitmap update for old xattr block + * 7) group descriptor for old block + * + * 6 & 7 can happen if we have two racing threads T_a and T_b + * which are each trying to set an xattr on inodes I_a and I_b + * which were both initially sharing an xattr block. + */ + credits = 7; + + /* Quota updates. */ + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb); + + /* + * In case of inline data, we may push out the data to a block, + * so we need to reserve credits for this eventuality + */ + if (inode && ext4_has_inline_data(inode)) + credits += ext4_chunk_trans_extent(inode, 1) + 1; + + /* We are done if ea_inode feature is not enabled. */ + if (!ext4_has_feature_ea_inode(sb)) + return credits; + + /* New ea_inode, inode map, block bitmap, group descriptor. */ + credits += 4; + + /* Data blocks. */ + blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + + /* Indirection block or one level of extent tree. */ + blocks += 1; + + /* Block bitmap and group descriptor updates for each block. */ + credits += blocks * 2; + + /* Blocks themselves. */ + credits += blocks; + + if (!is_create) { + /* Dereference ea_inode holding old xattr value. + * Old ea_inode, inode map, block bitmap, group descriptor. + */ + credits += 4; + + /* Data blocks for old ea_inode. */ + blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits; + + /* Indirection block or one level of extent tree for old + * ea_inode. + */ + blocks += 1; + + /* Block bitmap and group descriptor updates for each block. */ + credits += blocks * 2; + } + + /* We may need to clone the existing xattr block in which case we need + * to increment ref counts for existing ea_inodes referenced by it. + */ + if (block_bh) { + struct ext4_xattr_entry *entry = BFIRST(block_bh); + + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) + if (entry->e_value_inum) + /* Ref count update on ea_inode. 
*/ + credits += 1; + } + return credits; +} + +static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, + int ref_change) +{ + struct ext4_iloc iloc; + u64 ref_count; + int ret; + + inode_lock_nested(ea_inode, I_MUTEX_XATTR); + + ret = ext4_reserve_inode_write(handle, ea_inode, &iloc); + if (ret) + goto out; + + ref_count = ext4_xattr_inode_get_ref(ea_inode); + if ((ref_count == 0 && ref_change < 0) || (ref_count == U64_MAX && ref_change > 0)) { + ext4_error_inode(ea_inode, __func__, __LINE__, 0, + "EA inode %lu ref wraparound: ref_count=%lld ref_change=%d", + ea_inode->i_ino, ref_count, ref_change); + ret = -EFSCORRUPTED; + goto out; + } + ref_count += ref_change; + ext4_xattr_inode_set_ref(ea_inode, ref_count); + + if (ref_change > 0) { + if (ref_count == 1) { + WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u", + ea_inode->i_ino, ea_inode->i_nlink); + + set_nlink(ea_inode, 1); + ext4_orphan_del(handle, ea_inode); + } + } else { + if (ref_count == 0) { + WARN_ONCE(ea_inode->i_nlink != 1, + "EA inode %lu i_nlink=%u", + ea_inode->i_ino, ea_inode->i_nlink); + + clear_nlink(ea_inode); + ext4_orphan_add(handle, ea_inode); + } + } + + ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc); + if (ret) + ext4_warning_inode(ea_inode, + "ext4_mark_iloc_dirty() failed ret=%d", ret); +out: + inode_unlock(ea_inode); + return ret; +} + +static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode) +{ + return ext4_xattr_inode_update_ref(handle, ea_inode, 1); +} + +static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode) +{ + return ext4_xattr_inode_update_ref(handle, ea_inode, -1); +} + +static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent, + struct ext4_xattr_entry *first) +{ + struct inode *ea_inode; + struct ext4_xattr_entry *entry; + struct ext4_xattr_entry *failed_entry; + unsigned int ea_ino; + int err, saved_err; + + for (entry = first; !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + ea_ino = le32_to_cpu(entry->e_value_inum); + err = ext4_xattr_inode_iget(parent, ea_ino, + le32_to_cpu(entry->e_hash), + &ea_inode); + if (err) + goto cleanup; + err = ext4_xattr_inode_inc_ref(handle, ea_inode); + if (err) { + ext4_warning_inode(ea_inode, "inc ref error %d", err); + iput(ea_inode); + goto cleanup; + } + iput(ea_inode); + } + return 0; + +cleanup: + saved_err = err; + failed_entry = entry; + + for (entry = first; entry != failed_entry; + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + ea_ino = le32_to_cpu(entry->e_value_inum); + err = ext4_xattr_inode_iget(parent, ea_ino, + le32_to_cpu(entry->e_hash), + &ea_inode); + if (err) { + ext4_warning(parent->i_sb, + "cleanup ea_ino %u iget error %d", ea_ino, + err); + continue; + } + err = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (err) + ext4_warning_inode(ea_inode, "cleanup dec ref error %d", + err); + iput(ea_inode); + } + return saved_err; +} + +static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode, + struct buffer_head *bh, bool block_csum, bool dirty) +{ + int error; + + if (bh && dirty) { + if (block_csum) + ext4_xattr_block_csum_set(inode, bh); + error = ext4_handle_dirty_metadata(handle, NULL, bh); + if (error) { + ext4_warning(inode->i_sb, "Handle metadata (error %d)", + error); + return error; + } + } + return 0; +} + +static void +ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, + struct buffer_head *bh, + struct ext4_xattr_entry 
*first, bool block_csum, + struct ext4_xattr_inode_array **ea_inode_array, + int extra_credits, bool skip_quota) +{ + struct inode *ea_inode; + struct ext4_xattr_entry *entry; + struct ext4_iloc iloc; + bool dirty = false; + unsigned int ea_ino; + int err; + int credits; + void *end; + + if (block_csum) + end = (void *)bh->b_data + bh->b_size; + else { + ext4_get_inode_loc(parent, &iloc); + end = (void *)ext4_raw_inode(&iloc) + EXT4_SB(parent->i_sb)->s_inode_size; + } + + /* One credit for dec ref on ea_inode, one for orphan list addition, */ + credits = 2 + extra_credits; + + for (entry = first; (void *)entry < end && !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + ea_ino = le32_to_cpu(entry->e_value_inum); + err = ext4_xattr_inode_iget(parent, ea_ino, + le32_to_cpu(entry->e_hash), + &ea_inode); + if (err) + continue; + + err = ext4_expand_inode_array(ea_inode_array, ea_inode); + if (err) { + ext4_warning_inode(ea_inode, + "Expand inode array err=%d", err); + iput(ea_inode); + continue; + } + + err = ext4_journal_ensure_credits_fn(handle, credits, credits, + ext4_free_metadata_revoke_credits(parent->i_sb, 1), + ext4_xattr_restart_fn(handle, parent, bh, block_csum, + dirty)); + if (err < 0) { + ext4_warning_inode(ea_inode, "Ensure credits err=%d", + err); + continue; + } + if (err > 0) { + err = ext4_journal_get_write_access(handle, + parent->i_sb, bh, EXT4_JTR_NONE); + if (err) { + ext4_warning_inode(ea_inode, + "Re-get write access err=%d", + err); + continue; + } + } + + err = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (err) { + ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d", + err); + continue; + } + + if (!skip_quota) + ext4_xattr_inode_free_quota(parent, ea_inode, + le32_to_cpu(entry->e_value_size)); + + /* + * Forget about ea_inode within the same transaction that + * decrements the ref count. This avoids duplicate decrements in + * case the rest of the work spills over to subsequent + * transactions. + */ + entry->e_value_inum = 0; + entry->e_value_size = 0; + + dirty = true; + } + + if (dirty) { + /* + * Note that we are deliberately skipping csum calculation for + * the final update because we do not expect any journal + * restarts until xattr block is freed. + */ + + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + ext4_warning_inode(parent, + "handle dirty metadata err=%d", err); + } +} + +/* + * Release the xattr block BH: If the reference count is > 1, decrement it; + * otherwise free the block. 
+ */ +static void +ext4_xattr_release_block(handle_t *handle, struct inode *inode, + struct buffer_head *bh, + struct ext4_xattr_inode_array **ea_inode_array, + int extra_credits) +{ + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); + u32 hash, ref; + int error = 0; + + BUFFER_TRACE(bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); + if (error) + goto out; + +retry_ref: + lock_buffer(bh); + hash = le32_to_cpu(BHDR(bh)->h_hash); + ref = le32_to_cpu(BHDR(bh)->h_refcount); + if (ref == 1) { + ea_bdebug(bh, "refcount now=0; freeing"); + /* + * This must happen under buffer lock for + * ext4_xattr_block_set() to reliably detect freed block + */ + if (ea_block_cache) { + struct mb_cache_entry *oe; + + oe = mb_cache_entry_delete_or_get(ea_block_cache, hash, + bh->b_blocknr); + if (oe) { + unlock_buffer(bh); + mb_cache_entry_wait_unused(oe); + mb_cache_entry_put(ea_block_cache, oe); + goto retry_ref; + } + } + get_bh(bh); + unlock_buffer(bh); + + if (ext4_has_feature_ea_inode(inode->i_sb)) + ext4_xattr_inode_dec_ref_all(handle, inode, bh, + BFIRST(bh), + true /* block_csum */, + ea_inode_array, + extra_credits, + true /* skip_quota */); + ext4_free_blocks(handle, inode, bh, 0, 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + } else { + ref--; + BHDR(bh)->h_refcount = cpu_to_le32(ref); + if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) { + struct mb_cache_entry *ce; + + if (ea_block_cache) { + ce = mb_cache_entry_get(ea_block_cache, hash, + bh->b_blocknr); + if (ce) { + set_bit(MBE_REUSABLE_B, &ce->e_flags); + mb_cache_entry_put(ea_block_cache, ce); + } + } + } + + ext4_xattr_block_csum_set(inode, bh); + /* + * Beware of this ugliness: Releasing of xattr block references + * from different inodes can race and so we have to protect + * from a race where someone else frees the block (and releases + * its journal_head) before we are done dirtying the buffer. In + * nojournal mode this race is harmless and we actually cannot + * call ext4_handle_dirty_metadata() with locked buffer as + * that function can call sync_dirty_buffer() so for that case + * we handle the dirtying after unlocking the buffer. + */ + if (ext4_handle_valid(handle)) + error = ext4_handle_dirty_metadata(handle, inode, bh); + unlock_buffer(bh); + if (!ext4_handle_valid(handle)) + error = ext4_handle_dirty_metadata(handle, inode, bh); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); + ea_bdebug(bh, "refcount now=%d; releasing", + le32_to_cpu(BHDR(bh)->h_refcount)); + } +out: + ext4_std_error(inode->i_sb, error); + return; +} + +/* + * Find the available free space for EAs. This also returns the total number of + * bytes used by EA entries. + */ +static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, + size_t *min_offs, void *base, int *total) +{ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + if (!last->e_value_inum && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < *min_offs) + *min_offs = offs; + } + if (total) + *total += EXT4_XATTR_LEN(last->e_name_len); + } + return (*min_offs - ((void *)last - base) - sizeof(__u32)); +} + +/* + * Write the value of the EA in an inode. 
+ */ +static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode, + const void *buf, int bufsize) +{ + struct buffer_head *bh = NULL; + unsigned long block = 0; + int blocksize = ea_inode->i_sb->s_blocksize; + int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits; + int csize, wsize = 0; + int ret = 0, ret2 = 0; + int retries = 0; + +retry: + while (ret >= 0 && ret < max_blocks) { + struct ext4_map_blocks map; + map.m_lblk = block += ret; + map.m_len = max_blocks -= ret; + + ret = ext4_map_blocks(handle, ea_inode, &map, + EXT4_GET_BLOCKS_CREATE); + if (ret <= 0) { + ext4_mark_inode_dirty(handle, ea_inode); + if (ret == -ENOSPC && + ext4_should_retry_alloc(ea_inode->i_sb, &retries)) { + ret = 0; + goto retry; + } + break; + } + } + + if (ret < 0) + return ret; + + block = 0; + while (wsize < bufsize) { + brelse(bh); + csize = (bufsize - wsize) > blocksize ? blocksize : + bufsize - wsize; + bh = ext4_getblk(handle, ea_inode, block, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + if (!bh) { + WARN_ON_ONCE(1); + EXT4_ERROR_INODE(ea_inode, + "ext4_getblk() return bh = NULL"); + return -EFSCORRUPTED; + } + ret = ext4_journal_get_write_access(handle, ea_inode->i_sb, bh, + EXT4_JTR_NONE); + if (ret) + goto out; + + memcpy(bh->b_data, buf, csize); + /* + * Zero out block tail to avoid writing uninitialized memory + * to disk. + */ + if (csize < blocksize) + memset(bh->b_data + csize, 0, blocksize - csize); + set_buffer_uptodate(bh); + ext4_handle_dirty_metadata(handle, ea_inode, bh); + + buf += csize; + wsize += csize; + block += 1; + } + + inode_lock(ea_inode); + i_size_write(ea_inode, wsize); + ext4_update_i_disksize(ea_inode, wsize); + inode_unlock(ea_inode); + + ret2 = ext4_mark_inode_dirty(handle, ea_inode); + if (unlikely(ret2 && !ret)) + ret = ret2; + +out: + brelse(bh); + + return ret; +} + +/* + * Create an inode to store the value of a large EA. + */ +static struct inode *ext4_xattr_inode_create(handle_t *handle, + struct inode *inode, u32 hash) +{ + struct inode *ea_inode = NULL; + uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) }; + int err; + + if (inode->i_sb->s_root == NULL) { + ext4_warning(inode->i_sb, + "refuse to create EA inode when umounting"); + WARN_ON(1); + return ERR_PTR(-EINVAL); + } + + /* + * Let the next inode be the goal, so we try and allocate the EA inode + * in the same group, or nearby one. + */ + ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, + S_IFREG | 0600, NULL, inode->i_ino + 1, owner, + EXT4_EA_INODE_FL); + if (!IS_ERR(ea_inode)) { + ea_inode->i_op = &ext4_file_inode_operations; + ea_inode->i_fop = &ext4_file_operations; + ext4_set_aops(ea_inode); + ext4_xattr_inode_set_class(ea_inode); + unlock_new_inode(ea_inode); + ext4_xattr_inode_set_ref(ea_inode, 1); + ext4_xattr_inode_set_hash(ea_inode, hash); + err = ext4_mark_inode_dirty(handle, ea_inode); + if (!err) + err = ext4_inode_attach_jinode(ea_inode); + if (err) { + if (ext4_xattr_inode_dec_ref(handle, ea_inode)) + ext4_warning_inode(ea_inode, + "cleanup dec ref error %d", err); + iput(ea_inode); + return ERR_PTR(err); + } + + /* + * Xattr inodes are shared therefore quota charging is performed + * at a higher level. 
+ */ + dquot_free_inode(ea_inode); + dquot_drop(ea_inode); + inode_lock(ea_inode); + ea_inode->i_flags |= S_NOQUOTA; + inode_unlock(ea_inode); + } + + return ea_inode; +} + +static struct inode * +ext4_xattr_inode_cache_find(struct inode *inode, const void *value, + size_t value_len, u32 hash) +{ + struct inode *ea_inode; + struct mb_cache_entry *ce; + struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); + void *ea_data; + + if (!ea_inode_cache) + return NULL; + + ce = mb_cache_entry_find_first(ea_inode_cache, hash); + if (!ce) + return NULL; + + WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) && + !(current->flags & PF_MEMALLOC_NOFS)); + + ea_data = kvmalloc(value_len, GFP_NOFS); + if (!ea_data) { + mb_cache_entry_put(ea_inode_cache, ce); + return NULL; + } + + while (ce) { + ea_inode = ext4_iget(inode->i_sb, ce->e_value, + EXT4_IGET_EA_INODE); + if (IS_ERR(ea_inode)) + goto next_entry; + ext4_xattr_inode_set_class(ea_inode); + if (i_size_read(ea_inode) == value_len && + !ext4_xattr_inode_read(ea_inode, ea_data, value_len) && + !ext4_xattr_inode_verify_hashes(ea_inode, NULL, ea_data, + value_len) && + !memcmp(value, ea_data, value_len)) { + mb_cache_entry_touch(ea_inode_cache, ce); + mb_cache_entry_put(ea_inode_cache, ce); + kvfree(ea_data); + return ea_inode; + } + iput(ea_inode); + next_entry: + ce = mb_cache_entry_find_next(ea_inode_cache, ce); + } + kvfree(ea_data); + return NULL; +} + +/* + * Add value of the EA in an inode. + */ +static struct inode *ext4_xattr_inode_lookup_create(handle_t *handle, + struct inode *inode, const void *value, size_t value_len) +{ + struct inode *ea_inode; + u32 hash; + int err; + + /* Account inode & space to quota even if sharing... */ + err = ext4_xattr_inode_alloc_quota(inode, value_len); + if (err) + return ERR_PTR(err); + + hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len); + ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash); + if (ea_inode) { + err = ext4_xattr_inode_inc_ref(handle, ea_inode); + if (err) + goto out_err; + return ea_inode; + } + + /* Create an inode for the EA value */ + ea_inode = ext4_xattr_inode_create(handle, inode, hash); + if (IS_ERR(ea_inode)) { + ext4_xattr_inode_free_quota(inode, NULL, value_len); + return ea_inode; + } + + err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); + if (err) { + if (ext4_xattr_inode_dec_ref(handle, ea_inode)) + ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); + goto out_err; + } + + if (EA_INODE_CACHE(inode)) + mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash, + ea_inode->i_ino, true /* reusable */); + return ea_inode; +out_err: + iput(ea_inode); + ext4_xattr_inode_free_quota(inode, NULL, value_len); + return ERR_PTR(err); +} + +/* + * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode + * feature is enabled. + */ +#define EXT4_XATTR_BLOCK_RESERVE(inode) min(i_blocksize(inode)/8, 1024U) + +static int ext4_xattr_set_entry(struct ext4_xattr_info *i, + struct ext4_xattr_search *s, + handle_t *handle, struct inode *inode, + struct inode *new_ea_inode, + bool is_block) +{ + struct ext4_xattr_entry *last, *next; + struct ext4_xattr_entry *here = s->here; + size_t min_offs = s->end - s->base, name_len = strlen(i->name); + int in_inode = i->in_inode; + struct inode *old_ea_inode = NULL; + size_t old_size, new_size; + int ret; + + /* Space used by old and new values. */ + old_size = (!s->not_found && !here->e_value_inum) ? 
+ EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) : 0; + new_size = (i->value && !in_inode) ? EXT4_XATTR_SIZE(i->value_len) : 0; + + /* + * Optimization for the simple case when old and new values have the + * same padded sizes. Not applicable if external inodes are involved. + */ + if (new_size && new_size == old_size) { + size_t offs = le16_to_cpu(here->e_value_offs); + void *val = s->base + offs; + + here->e_value_size = cpu_to_le32(i->value_len); + if (i->value == EXT4_ZERO_XATTR_VALUE) { + memset(val, 0, new_size); + } else { + memcpy(val, i->value, i->value_len); + /* Clear padding bytes. */ + memset(val + i->value_len, 0, new_size - i->value_len); + } + goto update_hash; + } + + /* Compute min_offs and last. */ + last = s->first; + for (; !IS_LAST_ENTRY(last); last = next) { + next = EXT4_XATTR_NEXT(last); + if ((void *)next >= s->end) { + EXT4_ERROR_INODE(inode, "corrupted xattr entries"); + ret = -EFSCORRUPTED; + goto out; + } + if (!last->e_value_inum && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + } + + /* Check whether we have enough space. */ + if (i->value) { + size_t free; + + free = min_offs - ((void *)last - s->base) - sizeof(__u32); + if (!s->not_found) + free += EXT4_XATTR_LEN(name_len) + old_size; + + if (free < EXT4_XATTR_LEN(name_len) + new_size) { + ret = -ENOSPC; + goto out; + } + + /* + * If storing the value in an external inode is an option, + * reserve space for xattr entries/names in the external + * attribute block so that a long value does not occupy the + * whole space and prevent further entries being added. + */ + if (ext4_has_feature_ea_inode(inode->i_sb) && + new_size && is_block && + (min_offs + old_size - new_size) < + EXT4_XATTR_BLOCK_RESERVE(inode)) { + ret = -ENOSPC; + goto out; + } + } + + /* + * Getting access to old and new ea inodes is subject to failures. + * Finish that work before doing any modifications to the xattr data. + */ + if (!s->not_found && here->e_value_inum) { + ret = ext4_xattr_inode_iget(inode, + le32_to_cpu(here->e_value_inum), + le32_to_cpu(here->e_hash), + &old_ea_inode); + if (ret) { + old_ea_inode = NULL; + goto out; + } + + /* We are ready to release ref count on the old_ea_inode. */ + ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode); + if (ret) + goto out; + + ext4_xattr_inode_free_quota(inode, old_ea_inode, + le32_to_cpu(here->e_value_size)); + } + + /* No failures allowed past this point. */ + + if (!s->not_found && here->e_value_size && !here->e_value_inum) { + /* Remove the old value. */ + void *first_val = s->base + min_offs; + size_t offs = le16_to_cpu(here->e_value_offs); + void *val = s->base + offs; + + memmove(first_val + old_size, first_val, val - first_val); + memset(first_val, 0, old_size); + min_offs += old_size; + + /* Adjust all value offsets. */ + last = s->first; + while (!IS_LAST_ENTRY(last)) { + size_t o = le16_to_cpu(last->e_value_offs); + + if (!last->e_value_inum && + last->e_value_size && o < offs) + last->e_value_offs = cpu_to_le16(o + old_size); + last = EXT4_XATTR_NEXT(last); + } + } + + if (!i->value) { + /* Remove old name. */ + size_t size = EXT4_XATTR_LEN(name_len); + + last = ENTRY((void *)last - size); + memmove(here, (void *)here + size, + (void *)last - (void *)here + sizeof(__u32)); + memset(last, 0, size); + + /* + * Update i_inline_off - moved ibody region might contain + * system.data attribute. Handling a failure here won't + * cause other complications for setting an xattr. 
+ */ + if (!is_block && ext4_has_inline_data(inode)) { + ret = ext4_find_inline_data_nolock(inode); + if (ret) { + ext4_warning_inode(inode, + "unable to update i_inline_off"); + goto out; + } + } + } else if (s->not_found) { + /* Insert new name. */ + size_t size = EXT4_XATTR_LEN(name_len); + size_t rest = (void *)last - (void *)here + sizeof(__u32); + + memmove((void *)here + size, here, rest); + memset(here, 0, size); + here->e_name_index = i->name_index; + here->e_name_len = name_len; + memcpy(here->e_name, i->name, name_len); + } else { + /* This is an update, reset value info. */ + here->e_value_inum = 0; + here->e_value_offs = 0; + here->e_value_size = 0; + } + + if (i->value) { + /* Insert new value. */ + if (in_inode) { + here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino); + } else if (i->value_len) { + void *val = s->base + min_offs - new_size; + + here->e_value_offs = cpu_to_le16(min_offs - new_size); + if (i->value == EXT4_ZERO_XATTR_VALUE) { + memset(val, 0, new_size); + } else { + memcpy(val, i->value, i->value_len); + /* Clear padding bytes. */ + memset(val + i->value_len, 0, + new_size - i->value_len); + } + } + here->e_value_size = cpu_to_le32(i->value_len); + } + +update_hash: + if (i->value) { + __le32 hash = 0; + + /* Entry hash calculation. */ + if (in_inode) { + __le32 crc32c_hash; + + /* + * Feed crc32c hash instead of the raw value for entry + * hash calculation. This is to avoid walking + * potentially long value buffer again. + */ + crc32c_hash = cpu_to_le32( + ext4_xattr_inode_get_hash(new_ea_inode)); + hash = ext4_xattr_hash_entry(here->e_name, + here->e_name_len, + &crc32c_hash, 1); + } else if (is_block) { + __le32 *value = s->base + le16_to_cpu( + here->e_value_offs); + + hash = ext4_xattr_hash_entry(here->e_name, + here->e_name_len, value, + new_size >> 2); + } + here->e_hash = hash; + } + + if (is_block) + ext4_xattr_rehash((struct ext4_xattr_header *)s->base); + + ret = 0; +out: + iput(old_ea_inode); + return ret; +} + +struct ext4_xattr_block_find { + struct ext4_xattr_search s; + struct buffer_head *bh; +}; + +static int +ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + int error; + + ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", + i->name_index, i->name, i->value, (long)i->value_len); + + if (EXT4_I(inode)->i_file_acl) { + /* The inode already has an extended attribute block. */ + bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bs->bh)) { + error = PTR_ERR(bs->bh); + bs->bh = NULL; + return error; + } + ea_bdebug(bs->bh, "b_count=%d, refcount=%d", + atomic_read(&(bs->bh->b_count)), + le32_to_cpu(BHDR(bs->bh)->h_refcount)); + error = ext4_xattr_check_block(inode, bs->bh); + if (error) + return error; + /* Find the named attribute. 
*/ + bs->s.base = BHDR(bs->bh); + bs->s.first = BFIRST(bs->bh); + bs->s.end = bs->bh->b_data + bs->bh->b_size; + bs->s.here = bs->s.first; + error = xattr_find_entry(inode, &bs->s.here, bs->s.end, + i->name_index, i->name, 1); + if (error && error != -ENODATA) + return error; + bs->s.not_found = error; + } + return 0; +} + +static int +ext4_xattr_block_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh = NULL; + struct ext4_xattr_search s_copy = bs->s; + struct ext4_xattr_search *s = &s_copy; + struct mb_cache_entry *ce = NULL; + int error = 0; + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); + struct inode *ea_inode = NULL, *tmp_inode; + size_t old_ea_inode_quota = 0; + unsigned int ea_ino; + +#define header(x) ((struct ext4_xattr_header *)(x)) + + /* If we need EA inode, prepare it before locking the buffer */ + if (i->value && i->in_inode) { + WARN_ON_ONCE(!i->value_len); + + ea_inode = ext4_xattr_inode_lookup_create(handle, inode, + i->value, i->value_len); + if (IS_ERR(ea_inode)) { + error = PTR_ERR(ea_inode); + ea_inode = NULL; + goto cleanup; + } + } + + if (s->base) { + int offset = (char *)s->here - bs->bh->b_data; + + BUFFER_TRACE(bs->bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, sb, bs->bh, + EXT4_JTR_NONE); + if (error) + goto cleanup; + + lock_buffer(bs->bh); + + if (header(s->base)->h_refcount == cpu_to_le32(1)) { + __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash); + + /* + * This must happen under buffer lock for + * ext4_xattr_block_set() to reliably detect modified + * block + */ + if (ea_block_cache) { + struct mb_cache_entry *oe; + + oe = mb_cache_entry_delete_or_get(ea_block_cache, + hash, bs->bh->b_blocknr); + if (oe) { + /* + * Xattr block is getting reused. Leave + * it alone. + */ + mb_cache_entry_put(ea_block_cache, oe); + goto clone_block; + } + } + ea_bdebug(bs->bh, "modifying in-place"); + error = ext4_xattr_set_entry(i, s, handle, inode, + ea_inode, true /* is_block */); + ext4_xattr_block_csum_set(inode, bs->bh); + unlock_buffer(bs->bh); + if (error == -EFSCORRUPTED) + goto bad_block; + if (!error) + error = ext4_handle_dirty_metadata(handle, + inode, + bs->bh); + if (error) + goto cleanup; + goto inserted; + } +clone_block: + unlock_buffer(bs->bh); + ea_bdebug(bs->bh, "cloning"); + s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS); + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + s->first = ENTRY(header(s->base)+1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->here = ENTRY(s->base + offset); + s->end = s->base + bs->bh->b_size; + + /* + * If existing entry points to an xattr inode, we need + * to prevent ext4_xattr_set_entry() from decrementing + * ref count on it because the reference belongs to the + * original block. In this case, make the entry look + * like it has an empty value. + */ + if (!s->not_found && s->here->e_value_inum) { + ea_ino = le32_to_cpu(s->here->e_value_inum); + error = ext4_xattr_inode_iget(inode, ea_ino, + le32_to_cpu(s->here->e_hash), + &tmp_inode); + if (error) + goto cleanup; + + if (!ext4_test_inode_state(tmp_inode, + EXT4_STATE_LUSTRE_EA_INODE)) { + /* + * Defer quota free call for previous + * inode until success is guaranteed. 
+ */ + old_ea_inode_quota = le32_to_cpu( + s->here->e_value_size); + } + iput(tmp_inode); + + s->here->e_value_inum = 0; + s->here->e_value_size = 0; + } + } else { + /* Allocate a buffer where we construct the new block. */ + s->base = kzalloc(sb->s_blocksize, GFP_NOFS); + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); + header(s->base)->h_blocks = cpu_to_le32(1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->first = ENTRY(header(s->base)+1); + s->here = ENTRY(header(s->base)+1); + s->end = s->base + sb->s_blocksize; + } + + error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode, + true /* is_block */); + if (error == -EFSCORRUPTED) + goto bad_block; + if (error) + goto cleanup; + +inserted: + if (!IS_LAST_ENTRY(s->first)) { + new_bh = ext4_xattr_block_cache_find(inode, header(s->base), &ce); + if (IS_ERR(new_bh)) { + error = PTR_ERR(new_bh); + new_bh = NULL; + goto cleanup; + } + + if (new_bh) { + /* We found an identical block in the cache. */ + if (new_bh == bs->bh) + ea_bdebug(new_bh, "keeping"); + else { + u32 ref; + +#ifdef EXT4_XATTR_DEBUG + WARN_ON_ONCE(dquot_initialize_needed(inode)); +#endif + /* The old block is released after updating + the inode. */ + error = dquot_alloc_block(inode, + EXT4_C2B(EXT4_SB(sb), 1)); + if (error) + goto cleanup; + BUFFER_TRACE(new_bh, "get_write_access"); + error = ext4_journal_get_write_access( + handle, sb, new_bh, + EXT4_JTR_NONE); + if (error) + goto cleanup_dquot; + lock_buffer(new_bh); + /* + * We have to be careful about races with + * adding references to xattr block. Once we + * hold buffer lock xattr block's state is + * stable so we can check the additional + * reference fits. + */ + ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1; + if (ref > EXT4_XATTR_REFCOUNT_MAX) { + /* + * Undo everything and check mbcache + * again. + */ + unlock_buffer(new_bh); + dquot_free_block(inode, + EXT4_C2B(EXT4_SB(sb), + 1)); + brelse(new_bh); + mb_cache_entry_put(ea_block_cache, ce); + ce = NULL; + new_bh = NULL; + goto inserted; + } + BHDR(new_bh)->h_refcount = cpu_to_le32(ref); + if (ref == EXT4_XATTR_REFCOUNT_MAX) + clear_bit(MBE_REUSABLE_B, &ce->e_flags); + ea_bdebug(new_bh, "reusing; refcount now=%d", + ref); + ext4_xattr_block_csum_set(inode, new_bh); + unlock_buffer(new_bh); + error = ext4_handle_dirty_metadata(handle, + inode, + new_bh); + if (error) + goto cleanup_dquot; + } + mb_cache_entry_touch(ea_block_cache, ce); + mb_cache_entry_put(ea_block_cache, ce); + ce = NULL; + } else if (bs->bh && s->base == bs->bh->b_data) { + /* We were modifying this block in-place. 
*/ + ea_bdebug(bs->bh, "keeping this block"); + ext4_xattr_block_cache_insert(ea_block_cache, bs->bh); + new_bh = bs->bh; + get_bh(new_bh); + } else { + /* We need to allocate a new block */ + ext4_fsblk_t goal, block; + +#ifdef EXT4_XATTR_DEBUG + WARN_ON_ONCE(dquot_initialize_needed(inode)); +#endif + goal = ext4_group_first_block_no(sb, + EXT4_I(inode)->i_block_group); + block = ext4_new_meta_blocks(handle, inode, goal, 0, + NULL, &error); + if (error) + goto cleanup; + + ea_idebug(inode, "creating block %llu", + (unsigned long long)block); + + new_bh = sb_getblk(sb, block); + if (unlikely(!new_bh)) { + error = -ENOMEM; +getblk_failed: + ext4_free_blocks(handle, inode, NULL, block, 1, + EXT4_FREE_BLOCKS_METADATA); + goto cleanup; + } + error = ext4_xattr_inode_inc_ref_all(handle, inode, + ENTRY(header(s->base)+1)); + if (error) + goto getblk_failed; + if (ea_inode) { + /* Drop the extra ref on ea_inode. */ + error = ext4_xattr_inode_dec_ref(handle, + ea_inode); + if (error) + ext4_warning_inode(ea_inode, + "dec ref error=%d", + error); + iput(ea_inode); + ea_inode = NULL; + } + + lock_buffer(new_bh); + error = ext4_journal_get_create_access(handle, sb, + new_bh, EXT4_JTR_NONE); + if (error) { + unlock_buffer(new_bh); + error = -EIO; + goto getblk_failed; + } + memcpy(new_bh->b_data, s->base, new_bh->b_size); + ext4_xattr_block_csum_set(inode, new_bh); + set_buffer_uptodate(new_bh); + unlock_buffer(new_bh); + ext4_xattr_block_cache_insert(ea_block_cache, new_bh); + error = ext4_handle_dirty_metadata(handle, inode, + new_bh); + if (error) + goto cleanup; + } + } + + if (old_ea_inode_quota) + ext4_xattr_inode_free_quota(inode, NULL, old_ea_inode_quota); + + /* Update the inode. */ + EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; + + /* Drop the previous xattr block. */ + if (bs->bh && bs->bh != new_bh) { + struct ext4_xattr_inode_array *ea_inode_array = NULL; + + ext4_xattr_release_block(handle, inode, bs->bh, + &ea_inode_array, + 0 /* extra_credits */); + ext4_xattr_inode_array_free(ea_inode_array); + } + error = 0; + +cleanup: + if (ea_inode) { + if (error) { + int error2; + + error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (error2) + ext4_warning_inode(ea_inode, "dec ref error=%d", + error2); + ext4_xattr_inode_free_quota(inode, ea_inode, + i_size_read(ea_inode)); + } + iput(ea_inode); + } + if (ce) + mb_cache_entry_put(ea_block_cache, ce); + brelse(new_bh); + if (!(bs->bh && s->base == bs->bh->b_data)) + kfree(s->base); + + return error; + +cleanup_dquot: + dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); + goto cleanup; + +bad_block: + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + goto cleanup; + +#undef header +} + +int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + int error; + + if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) + return 0; + + raw_inode = ext4_raw_inode(&is->iloc); + header = IHDR(inode, raw_inode); + is->s.base = is->s.first = IFIRST(header); + is->s.here = is->s.first; + is->s.end = ITAIL(inode, raw_inode); + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + /* Find the named attribute. 
*/ + error = xattr_find_entry(inode, &is->s.here, is->s.end, + i->name_index, i->name, 0); + if (error && error != -ENODATA) + return error; + is->s.not_found = error; + } + return 0; +} + +int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_search *s = &is->s; + struct inode *ea_inode = NULL; + int error; + + if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) + return -ENOSPC; + + /* If we need EA inode, prepare it before locking the buffer */ + if (i->value && i->in_inode) { + WARN_ON_ONCE(!i->value_len); + + ea_inode = ext4_xattr_inode_lookup_create(handle, inode, + i->value, i->value_len); + if (IS_ERR(ea_inode)) + return PTR_ERR(ea_inode); + } + error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode, + false /* is_block */); + if (error) { + if (ea_inode) { + int error2; + + error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (error2) + ext4_warning_inode(ea_inode, "dec ref error=%d", + error2); + + ext4_xattr_inode_free_quota(inode, ea_inode, + i_size_read(ea_inode)); + iput(ea_inode); + } + return error; + } + header = IHDR(inode, ext4_raw_inode(&is->iloc)); + if (!IS_LAST_ENTRY(s->first)) { + header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); + ext4_set_inode_state(inode, EXT4_STATE_XATTR); + } else { + header->h_magic = cpu_to_le32(0); + ext4_clear_inode_state(inode, EXT4_STATE_XATTR); + } + iput(ea_inode); + return 0; +} + +static int ext4_xattr_value_same(struct ext4_xattr_search *s, + struct ext4_xattr_info *i) +{ + void *value; + + /* When e_value_inum is set the value is stored externally. */ + if (s->here->e_value_inum) + return 0; + if (le32_to_cpu(s->here->e_value_size) != i->value_len) + return 0; + value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs); + return !memcmp(value, i->value, i->value_len); +} + +static struct buffer_head *ext4_xattr_get_block(struct inode *inode) +{ + struct buffer_head *bh; + int error; + + if (!EXT4_I(inode)->i_file_acl) + return NULL; + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) + return bh; + error = ext4_xattr_check_block(inode, bh); + if (error) { + brelse(bh); + return ERR_PTR(error); + } + return bh; +} + +/* + * ext4_xattr_set_handle() + * + * Create, replace or remove an extended attribute for this inode. Value + * is NULL to remove an existing extended attribute, and non-NULL to + * either replace an existing extended attribute, or create a new extended + * attribute. The flags XATTR_REPLACE and XATTR_CREATE + * specify that an extended attribute must exist and must not exist + * previous to the call, respectively. + * + * Returns 0, or a negative error number on failure. + */ +int +ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t value_len, + int flags) +{ + struct ext4_xattr_info i = { + .name_index = name_index, + .name = name, + .value = value, + .value_len = value_len, + .in_inode = 0, + }; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_block_find bs = { + .s = { .not_found = -ENODATA, }, + }; + int no_expand; + int error; + + if (!name) + return -EINVAL; + if (strlen(name) > 255) + return -ERANGE; + + ext4_write_lock_xattr(inode, &no_expand); + + /* Check journal credits under write lock. 
*/ + if (ext4_handle_valid(handle)) { + struct buffer_head *bh; + int credits; + + bh = ext4_xattr_get_block(inode); + if (IS_ERR(bh)) { + error = PTR_ERR(bh); + goto cleanup; + } + + credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, + value_len, + flags & XATTR_CREATE); + brelse(bh); + + if (jbd2_handle_buffer_credits(handle) < credits) { + error = -ENOSPC; + goto cleanup; + } + WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS)); + } + + error = ext4_reserve_inode_write(handle, inode, &is.iloc); + if (error) + goto cleanup; + + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) { + struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); + memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); + ext4_clear_inode_state(inode, EXT4_STATE_NEW); + } + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto cleanup; + if (is.s.not_found) + error = ext4_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + if (is.s.not_found && bs.s.not_found) { + error = -ENODATA; + if (flags & XATTR_REPLACE) + goto cleanup; + error = 0; + if (!value) + goto cleanup; + } else { + error = -EEXIST; + if (flags & XATTR_CREATE) + goto cleanup; + } + + if (!value) { + if (!is.s.not_found) + error = ext4_xattr_ibody_set(handle, inode, &i, &is); + else if (!bs.s.not_found) + error = ext4_xattr_block_set(handle, inode, &i, &bs); + } else { + error = 0; + /* Xattr value did not change? Save us some work and bail out */ + if (!is.s.not_found && ext4_xattr_value_same(&is.s, &i)) + goto cleanup; + if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) + goto cleanup; + + if (ext4_has_feature_ea_inode(inode->i_sb) && + (EXT4_XATTR_SIZE(i.value_len) > + EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize))) + i.in_inode = 1; +retry_inode: + error = ext4_xattr_ibody_set(handle, inode, &i, &is); + if (!error && !bs.s.not_found) { + i.value = NULL; + error = ext4_xattr_block_set(handle, inode, &i, &bs); + } else if (error == -ENOSPC) { + if (EXT4_I(inode)->i_file_acl && !bs.s.base) { + brelse(bs.bh); + bs.bh = NULL; + error = ext4_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + } + error = ext4_xattr_block_set(handle, inode, &i, &bs); + if (!error && !is.s.not_found) { + i.value = NULL; + error = ext4_xattr_ibody_set(handle, inode, &i, + &is); + } else if (error == -ENOSPC) { + /* + * Xattr does not fit in the block, store at + * external inode if possible. + */ + if (ext4_has_feature_ea_inode(inode->i_sb) && + i.value_len && !i.in_inode) { + i.in_inode = 1; + goto retry_inode; + } + } + } + } + if (!error) { + ext4_xattr_update_super_block(handle, inode->i_sb); + inode_set_ctime_current(inode); + inode_inc_iversion(inode); + if (!value) + no_expand = 0; + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + /* + * The bh is consumed by ext4_mark_iloc_dirty, even with + * error != 0. 
+ */ + is.iloc.bh = NULL; + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + } + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); + +cleanup: + brelse(is.iloc.bh); + brelse(bs.bh); + ext4_write_unlock_xattr(inode, &no_expand); + return error; +} + +int ext4_xattr_set_credits(struct inode *inode, size_t value_len, + bool is_create, int *credits) +{ + struct buffer_head *bh; + int err; + + *credits = 0; + + if (!EXT4_SB(inode->i_sb)->s_journal) + return 0; + + down_read(&EXT4_I(inode)->xattr_sem); + + bh = ext4_xattr_get_block(inode); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + } else { + *credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, + value_len, is_create); + brelse(bh); + err = 0; + } + + up_read(&EXT4_I(inode)->xattr_sem); + return err; +} + +/* + * ext4_xattr_set() + * + * Like ext4_xattr_set_handle, but start from an inode. This extended + * attribute modification is a filesystem transaction by itself. + * + * Returns 0, or a negative error number on failure. + */ +int +ext4_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t value_len, int flags) +{ + handle_t *handle; + struct super_block *sb = inode->i_sb; + int error, retries = 0; + int credits; + + error = dquot_initialize(inode); + if (error) + return error; + +retry: + error = ext4_xattr_set_credits(inode, value_len, flags & XATTR_CREATE, + &credits); + if (error) + return error; + + handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + } else { + int error2; + + error = ext4_xattr_set_handle(handle, inode, name_index, name, + value, value_len, flags); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, + handle); + error2 = ext4_journal_stop(handle); + if (error == -ENOSPC && + ext4_should_retry_alloc(sb, &retries)) + goto retry; + if (error == 0) + error = error2; + } + + return error; +} + +/* + * Shift the EA entries in the inode to create space for the increased + * i_extra_isize. 
+ */ +static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, + int value_offs_shift, void *to, + void *from, size_t n) +{ + struct ext4_xattr_entry *last = entry; + int new_offs; + + /* We always shift xattr headers further thus offsets get lower */ + BUG_ON(value_offs_shift > 0); + + /* Adjust the value offsets of the entries */ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + if (!last->e_value_inum && last->e_value_size) { + new_offs = le16_to_cpu(last->e_value_offs) + + value_offs_shift; + last->e_value_offs = cpu_to_le16(new_offs); + } + } + /* Shift the entries by n bytes */ + memmove(to, from, n); +} + +/* + * Move xattr pointed to by 'entry' from inode into external xattr block + */ +static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, + struct ext4_inode *raw_inode, + struct ext4_xattr_entry *entry) +{ + struct ext4_xattr_ibody_find *is = NULL; + struct ext4_xattr_block_find *bs = NULL; + char *buffer = NULL, *b_entry_name = NULL; + size_t value_size = le32_to_cpu(entry->e_value_size); + struct ext4_xattr_info i = { + .value = NULL, + .value_len = 0, + .name_index = entry->e_name_index, + .in_inode = !!entry->e_value_inum, + }; + struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); + int needs_kvfree = 0; + int error; + + is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); + bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); + b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); + if (!is || !bs || !b_entry_name) { + error = -ENOMEM; + goto out; + } + + is->s.not_found = -ENODATA; + bs->s.not_found = -ENODATA; + is->iloc.bh = NULL; + bs->bh = NULL; + + /* Save the entry name and the entry value */ + if (entry->e_value_inum) { + buffer = kvmalloc(value_size, GFP_NOFS); + if (!buffer) { + error = -ENOMEM; + goto out; + } + needs_kvfree = 1; + error = ext4_xattr_inode_get(inode, entry, buffer, value_size); + if (error) + goto out; + } else { + size_t value_offs = le16_to_cpu(entry->e_value_offs); + buffer = (void *)IFIRST(header) + value_offs; + } + + memcpy(b_entry_name, entry->e_name, entry->e_name_len); + b_entry_name[entry->e_name_len] = '\0'; + i.name = b_entry_name; + + error = ext4_get_inode_loc(inode, &is->iloc); + if (error) + goto out; + + error = ext4_xattr_ibody_find(inode, &i, is); + if (error) + goto out; + + i.value = buffer; + i.value_len = value_size; + error = ext4_xattr_block_find(inode, &i, bs); + if (error) + goto out; + + /* Move ea entry from the inode into the block */ + error = ext4_xattr_block_set(handle, inode, &i, bs); + if (error) + goto out; + + /* Remove the chosen entry from the inode */ + i.value = NULL; + i.value_len = 0; + error = ext4_xattr_ibody_set(handle, inode, &i, is); + +out: + kfree(b_entry_name); + if (needs_kvfree && buffer) + kvfree(buffer); + if (is) + brelse(is->iloc.bh); + if (bs) + brelse(bs->bh); + kfree(is); + kfree(bs); + + return error; +} + +static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode, + struct ext4_inode *raw_inode, + int isize_diff, size_t ifree, + size_t bfree, int *total_ino) +{ + struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); + struct ext4_xattr_entry *small_entry; + struct ext4_xattr_entry *entry; + struct ext4_xattr_entry *last; + unsigned int entry_size; /* EA entry size */ + unsigned int total_size; /* EA entry size + value size */ + unsigned int min_total_size; + int error; + + while (isize_diff > ifree) { + entry = NULL; + small_entry = NULL; + min_total_size = ~0U; + last = 
IFIRST(header); + /* Find the entry best suited to be pushed into EA block */ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + /* never move system.data out of the inode */ + if ((last->e_name_len == 4) && + (last->e_name_index == EXT4_XATTR_INDEX_SYSTEM) && + !memcmp(last->e_name, "data", 4)) + continue; + total_size = EXT4_XATTR_LEN(last->e_name_len); + if (!last->e_value_inum) + total_size += EXT4_XATTR_SIZE( + le32_to_cpu(last->e_value_size)); + if (total_size <= bfree && + total_size < min_total_size) { + if (total_size + ifree < isize_diff) { + small_entry = last; + } else { + entry = last; + min_total_size = total_size; + } + } + } + + if (entry == NULL) { + if (small_entry == NULL) + return -ENOSPC; + entry = small_entry; + } + + entry_size = EXT4_XATTR_LEN(entry->e_name_len); + total_size = entry_size; + if (!entry->e_value_inum) + total_size += EXT4_XATTR_SIZE( + le32_to_cpu(entry->e_value_size)); + error = ext4_xattr_move_to_block(handle, inode, raw_inode, + entry); + if (error) + return error; + + *total_ino -= entry_size; + ifree += total_size; + bfree -= total_size; + } + + return 0; +} + +/* + * Expand an inode by new_extra_isize bytes when EAs are present. + * Returns 0 on success or negative error number on failure. + */ +int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + static unsigned int mnt_count; + size_t min_offs; + size_t ifree, bfree; + int total_ino; + void *base, *end; + int error = 0, tried_min_extra_isize = 0; + int s_min_extra_isize = le16_to_cpu(sbi->s_es->s_min_extra_isize); + int isize_diff; /* How much do we need to grow i_extra_isize */ + +retry: + isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize; + if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) + return 0; + + header = IHDR(inode, raw_inode); + + /* + * Check if enough free space is available in the inode to shift the + * entries ahead by new_extra_isize. + */ + + base = IFIRST(header); + end = ITAIL(inode, raw_inode); + min_offs = end - base; + total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32); + + ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino); + if (ifree >= isize_diff) + goto shift; + + /* + * Enough free space isn't available in the inode, check if + * EA block can hold new_extra_isize bytes. 
+ */ + if (EXT4_I(inode)->i_file_acl) { + struct buffer_head *bh; + + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) { + error = PTR_ERR(bh); + goto cleanup; + } + error = ext4_xattr_check_block(inode, bh); + if (error) { + brelse(bh); + goto cleanup; + } + base = BHDR(bh); + end = bh->b_data + bh->b_size; + min_offs = end - base; + bfree = ext4_xattr_free_space(BFIRST(bh), &min_offs, base, + NULL); + brelse(bh); + if (bfree + ifree < isize_diff) { + if (!tried_min_extra_isize && s_min_extra_isize) { + tried_min_extra_isize++; + new_extra_isize = s_min_extra_isize; + goto retry; + } + error = -ENOSPC; + goto cleanup; + } + } else { + bfree = inode->i_sb->s_blocksize; + } + + error = ext4_xattr_make_inode_space(handle, inode, raw_inode, + isize_diff, ifree, bfree, + &total_ino); + if (error) { + if (error == -ENOSPC && !tried_min_extra_isize && + s_min_extra_isize) { + tried_min_extra_isize++; + new_extra_isize = s_min_extra_isize; + goto retry; + } + goto cleanup; + } +shift: + /* Adjust the offsets and shift the remaining entries ahead */ + ext4_xattr_shift_entries(IFIRST(header), EXT4_I(inode)->i_extra_isize + - new_extra_isize, (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, + (void *)header, total_ino); + EXT4_I(inode)->i_extra_isize = new_extra_isize; + + if (ext4_has_inline_data(inode)) + error = ext4_find_inline_data_nolock(inode); + +cleanup: + if (error && (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count))) { + ext4_warning(inode->i_sb, "Unable to expand inode %lu. Delete some EAs or run e2fsck.", + inode->i_ino); + mnt_count = le16_to_cpu(sbi->s_es->s_mnt_count); + } + return error; +} + +#define EIA_INCR 16 /* must be 2^n */ +#define EIA_MASK (EIA_INCR - 1) + +/* Add the large xattr @inode into @ea_inode_array for deferred iput(). + * If @ea_inode_array is new or full it will be grown and the old + * contents copied over. + */ +static int +ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, + struct inode *inode) +{ + if (*ea_inode_array == NULL) { + /* + * Start with 15 inodes, so it fits into a power-of-two size. + */ + (*ea_inode_array) = kmalloc( + struct_size(*ea_inode_array, inodes, EIA_MASK), + GFP_NOFS); + if (*ea_inode_array == NULL) + return -ENOMEM; + (*ea_inode_array)->count = 0; + } else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) { + /* expand the array once all 15 + n * 16 slots are full */ + struct ext4_xattr_inode_array *new_array = NULL; + + new_array = kmalloc( + struct_size(*ea_inode_array, inodes, + (*ea_inode_array)->count + EIA_INCR), + GFP_NOFS); + if (new_array == NULL) + return -ENOMEM; + memcpy(new_array, *ea_inode_array, + struct_size(*ea_inode_array, inodes, + (*ea_inode_array)->count)); + kfree(*ea_inode_array); + *ea_inode_array = new_array; + } + (*ea_inode_array)->count++; + (*ea_inode_array)->inodes[(*ea_inode_array)->count - 1] = inode; + return 0; +} + +/* + * ext4_xattr_delete_inode() + * + * Free extended attribute resources associated with this inode. Traverse + * all entries and decrement reference on any xattr inodes associated with this + * inode. This is called immediately before an inode is freed. We have exclusive + * access to the inode. If an orphan inode is deleted it will also release its + * references on xattr block and xattr inodes. 
+ */ +int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, + struct ext4_xattr_inode_array **ea_inode_array, + int extra_credits) +{ + struct buffer_head *bh = NULL; + struct ext4_xattr_ibody_header *header; + struct ext4_iloc iloc = { .bh = NULL }; + struct ext4_xattr_entry *entry; + struct inode *ea_inode; + int error; + + error = ext4_journal_ensure_credits(handle, extra_credits, + ext4_free_metadata_revoke_credits(inode->i_sb, 1)); + if (error < 0) { + EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error); + goto cleanup; + } + + if (ext4_has_feature_ea_inode(inode->i_sb) && + ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + + error = ext4_get_inode_loc(inode, &iloc); + if (error) { + EXT4_ERROR_INODE(inode, "inode loc (error %d)", error); + goto cleanup; + } + + error = ext4_journal_get_write_access(handle, inode->i_sb, + iloc.bh, EXT4_JTR_NONE); + if (error) { + EXT4_ERROR_INODE(inode, "write access (error %d)", + error); + goto cleanup; + } + + header = IHDR(inode, ext4_raw_inode(&iloc)); + if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC)) + ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh, + IFIRST(header), + false /* block_csum */, + ea_inode_array, + extra_credits, + false /* skip_quota */); + } + + if (EXT4_I(inode)->i_file_acl) { + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) { + error = PTR_ERR(bh); + if (error == -EIO) { + EXT4_ERROR_INODE_ERR(inode, EIO, + "block %llu read error", + EXT4_I(inode)->i_file_acl); + } + bh = NULL; + goto cleanup; + } + error = ext4_xattr_check_block(inode, bh); + if (error) + goto cleanup; + + if (ext4_has_feature_ea_inode(inode->i_sb)) { + for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + error = ext4_xattr_inode_iget(inode, + le32_to_cpu(entry->e_value_inum), + le32_to_cpu(entry->e_hash), + &ea_inode); + if (error) + continue; + ext4_xattr_inode_free_quota(inode, ea_inode, + le32_to_cpu(entry->e_value_size)); + iput(ea_inode); + } + + } + + ext4_xattr_release_block(handle, inode, bh, ea_inode_array, + extra_credits); + /* + * Update i_file_acl value in the same transaction that releases + * block. + */ + EXT4_I(inode)->i_file_acl = 0; + error = ext4_mark_inode_dirty(handle, inode); + if (error) { + EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)", + error); + goto cleanup; + } + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); + } + error = 0; +cleanup: + brelse(iloc.bh); + brelse(bh); + return error; +} + +void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array) +{ + int idx; + + if (ea_inode_array == NULL) + return; + + for (idx = 0; idx < ea_inode_array->count; ++idx) + iput(ea_inode_array->inodes[idx]); + kfree(ea_inode_array); +} + +/* + * ext4_xattr_block_cache_insert() + * + * Create a new entry in the extended attribute block cache, and insert + * it unless such an entry is already in the cache. 
+ */ +static void +ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache, + struct buffer_head *bh) +{ + struct ext4_xattr_header *header = BHDR(bh); + __u32 hash = le32_to_cpu(header->h_hash); + int reusable = le32_to_cpu(header->h_refcount) < + EXT4_XATTR_REFCOUNT_MAX; + int error; + + if (!ea_block_cache) + return; + error = mb_cache_entry_create(ea_block_cache, GFP_NOFS, hash, + bh->b_blocknr, reusable); + if (error) { + if (error == -EBUSY) + ea_bdebug(bh, "already in cache"); + } else + ea_bdebug(bh, "inserting [%x]", (int)hash); +} + +/* + * ext4_xattr_cmp() + * + * Compare two extended attribute blocks for equality. + * + * Returns 0 if the blocks are equal, 1 if they differ. + */ +static int +ext4_xattr_cmp(struct ext4_xattr_header *header1, + struct ext4_xattr_header *header2) +{ + struct ext4_xattr_entry *entry1, *entry2; + + entry1 = ENTRY(header1+1); + entry2 = ENTRY(header2+1); + while (!IS_LAST_ENTRY(entry1)) { + if (IS_LAST_ENTRY(entry2)) + return 1; + if (entry1->e_hash != entry2->e_hash || + entry1->e_name_index != entry2->e_name_index || + entry1->e_name_len != entry2->e_name_len || + entry1->e_value_size != entry2->e_value_size || + entry1->e_value_inum != entry2->e_value_inum || + memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) + return 1; + if (!entry1->e_value_inum && + memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), + (char *)header2 + le16_to_cpu(entry2->e_value_offs), + le32_to_cpu(entry1->e_value_size))) + return 1; + + entry1 = EXT4_XATTR_NEXT(entry1); + entry2 = EXT4_XATTR_NEXT(entry2); + } + if (!IS_LAST_ENTRY(entry2)) + return 1; + return 0; +} + +/* + * ext4_xattr_block_cache_find() + * + * Find an identical extended attribute block. + * + * Returns a pointer to the block found, or NULL if such a block was not + * found, or an error pointer if an error occurred while reading ea block. + */ +static struct buffer_head * +ext4_xattr_block_cache_find(struct inode *inode, + struct ext4_xattr_header *header, + struct mb_cache_entry **pce) +{ + __u32 hash = le32_to_cpu(header->h_hash); + struct mb_cache_entry *ce; + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); + + if (!ea_block_cache) + return NULL; + if (!header->h_hash) + return NULL; /* never share */ + ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); + ce = mb_cache_entry_find_first(ea_block_cache, hash); + while (ce) { + struct buffer_head *bh; + + bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO); + if (IS_ERR(bh)) { + if (PTR_ERR(bh) != -ENOMEM) + EXT4_ERROR_INODE(inode, "block %lu read error", + (unsigned long)ce->e_value); + mb_cache_entry_put(ea_block_cache, ce); + return bh; + } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { + *pce = ce; + return bh; + } + brelse(bh); + ce = mb_cache_entry_find_next(ea_block_cache, ce); + } + return NULL; +} + +#define NAME_HASH_SHIFT 5 +#define VALUE_HASH_SHIFT 16 + +/* + * ext4_xattr_hash_entry() + * + * Compute the hash of an extended attribute. + */ +static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, + size_t value_count) +{ + __u32 hash = 0; + + while (name_len--) { + hash = (hash << NAME_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ + (unsigned char)*name++; + } + while (value_count--) { + hash = (hash << VALUE_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ + le32_to_cpu(*value++); + } + return cpu_to_le32(hash); +} + +/* + * ext4_xattr_hash_entry_signed() + * + * Compute the hash of an extended attribute incorrectly. 
+ */ +static __le32 ext4_xattr_hash_entry_signed(char *name, size_t name_len, __le32 *value, size_t value_count) +{ + __u32 hash = 0; + + while (name_len--) { + hash = (hash << NAME_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ + (signed char)*name++; + } + while (value_count--) { + hash = (hash << VALUE_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ + le32_to_cpu(*value++); + } + return cpu_to_le32(hash); +} + +#undef NAME_HASH_SHIFT +#undef VALUE_HASH_SHIFT + +#define BLOCK_HASH_SHIFT 16 + +/* + * ext4_xattr_rehash() + * + * Re-compute the extended attribute hash value after an entry has changed. + */ +static void ext4_xattr_rehash(struct ext4_xattr_header *header) +{ + struct ext4_xattr_entry *here; + __u32 hash = 0; + + here = ENTRY(header+1); + while (!IS_LAST_ENTRY(here)) { + if (!here->e_hash) { + /* Block is not shared if an entry's hash value == 0 */ + hash = 0; + break; + } + hash = (hash << BLOCK_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ + le32_to_cpu(here->e_hash); + here = EXT4_XATTR_NEXT(here); + } + header->h_hash = cpu_to_le32(hash); +} + +#undef BLOCK_HASH_SHIFT + +#define HASH_BUCKET_BITS 10 + +struct mb_cache * +ext4_xattr_create_cache(void) +{ + return mb_cache_create(HASH_BUCKET_BITS); +} + +void ext4_xattr_destroy_cache(struct mb_cache *cache) +{ + if (cache) + mb_cache_destroy(cache); +} + diff --git a/fs/ext4l/xattr.h b/fs/ext4l/xattr.h new file mode 100644 index 00000000000..1fedf44d4fb --- /dev/null +++ b/fs/ext4l/xattr.h @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + File: fs/ext4/xattr.h + + On-disk format of extended attributes for the ext4 filesystem. + + (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> +*/ + +#include <linux/xattr.h> + +/* Magic value in attribute blocks */ +#define EXT4_XATTR_MAGIC 0xEA020000 + +/* Maximum number of references to one attribute block */ +#define EXT4_XATTR_REFCOUNT_MAX 1024 + +/* Name indexes */ +#define EXT4_XATTR_INDEX_USER 1 +#define EXT4_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define EXT4_XATTR_INDEX_TRUSTED 4 +#define EXT4_XATTR_INDEX_LUSTRE 5 +#define EXT4_XATTR_INDEX_SECURITY 6 +#define EXT4_XATTR_INDEX_SYSTEM 7 +#define EXT4_XATTR_INDEX_RICHACL 8 +#define EXT4_XATTR_INDEX_ENCRYPTION 9 +#define EXT4_XATTR_INDEX_HURD 10 /* Reserved for Hurd */ + +struct ext4_xattr_header { + __le32 h_magic; /* magic number for identification */ + __le32 h_refcount; /* reference count */ + __le32 h_blocks; /* number of disk blocks used */ + __le32 h_hash; /* hash value of all attributes */ + __le32 h_checksum; /* crc32c(uuid+blknum+xattrblock) */ + __u32 h_reserved[3]; /* zero right now */ +}; + +struct ext4_xattr_ibody_header { + __le32 h_magic; /* magic number for identification */ +}; + +struct ext4_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_offs; /* offset in disk block of value */ + __le32 e_value_inum; /* inode in which the value is stored */ + __le32 e_value_size; /* size of attribute value */ + __le32 e_hash; /* hash value of name and value */ + char e_name[]; /* attribute name */ +}; + +#define EXT4_XATTR_PAD_BITS 2 +#define EXT4_XATTR_PAD (1<<EXT4_XATTR_PAD_BITS) +#define EXT4_XATTR_ROUND (EXT4_XATTR_PAD-1) +#define EXT4_XATTR_LEN(name_len) \ + (((name_len) + EXT4_XATTR_ROUND + \ + sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND) +#define EXT4_XATTR_NEXT(entry) \ + ((struct ext4_xattr_entry *)( \ + (char *)(entry) + 
EXT4_XATTR_LEN((entry)->e_name_len))) +#define EXT4_XATTR_SIZE(size) \ + (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND) + +#define IHDR(inode, raw_inode) \ + ((struct ext4_xattr_ibody_header *) \ + ((void *)raw_inode + \ + EXT4_GOOD_OLD_INODE_SIZE + \ + EXT4_I(inode)->i_extra_isize)) +#define ITAIL(inode, raw_inode) \ + ((void *)(raw_inode) + \ + EXT4_SB((inode)->i_sb)->s_inode_size) +#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) + +/* + * XATTR_SIZE_MAX is currently 64k, but for the purposes of checking + * for file system consistency errors, we use a somewhat bigger value. + * This allows XATTR_SIZE_MAX to grow in the future, but by using this + * instead of INT_MAX for certain consistency checks, we don't need to + * worry about arithmetic overflows. (Actually XATTR_SIZE_MAX is + * defined in include/uapi/linux/limits.h, so changing it is + * not going to be trivial....) + */ +#define EXT4_XATTR_SIZE_MAX (1 << 24) + +/* + * The minimum size of EA value when you start storing it in an external inode + * size of block - size of header - size of 1 entry - 4 null bytes + */ +#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b) \ + ((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4) + +#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) +#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) +#define BFIRST(bh) ENTRY(BHDR(bh)+1) +#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + +#define EXT4_ZERO_XATTR_VALUE ((void *)-1) + +/* + * If we want to add an xattr to the inode, we should make sure that + * i_extra_isize is not 0 and that the inode size is not less than + * EXT4_GOOD_OLD_INODE_SIZE + extra_isize + pad. + * EXT4_GOOD_OLD_INODE_SIZE extra_isize header entry pad data + * |--------------------------|------------|------|---------|---|-------| + */ +#define EXT4_INODE_HAS_XATTR_SPACE(inode) \ + ((EXT4_I(inode)->i_extra_isize != 0) && \ + (EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize + \ + sizeof(struct ext4_xattr_ibody_header) + EXT4_XATTR_PAD <= \ + EXT4_INODE_SIZE((inode)->i_sb))) + +struct ext4_xattr_info { + const char *name; + const void *value; + size_t value_len; + int name_index; + int in_inode; +}; + +struct ext4_xattr_search { + struct ext4_xattr_entry *first; + void *base; + void *end; + struct ext4_xattr_entry *here; + int not_found; +}; + +struct ext4_xattr_ibody_find { + struct ext4_xattr_search s; + struct ext4_iloc iloc; +}; + +struct ext4_xattr_inode_array { + unsigned int count; + struct inode *inodes[] __counted_by(count); +}; + +extern const struct xattr_handler ext4_xattr_user_handler; +extern const struct xattr_handler ext4_xattr_trusted_handler; +extern const struct xattr_handler ext4_xattr_security_handler; +extern const struct xattr_handler ext4_xattr_hurd_handler; + +#define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c" + +/* + * The EXT4_STATE_NO_EXPAND is overloaded and used for two purposes. + * The first is to signal that the inline xattrs and data are + * taking up so much space that we might as well not keep trying to + * expand it. The second is that xattr_sem is taken for writing, so + * we shouldn't try to recurse into the inode expansion. For this + * second case, we need to make sure that we save and restore the + * NO_EXPAND state flag appropriately.
+ */ +static inline void ext4_write_lock_xattr(struct inode *inode, int *save) +{ + down_write(&EXT4_I(inode)->xattr_sem); + *save = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); +} + +static inline int ext4_write_trylock_xattr(struct inode *inode, int *save) +{ + if (down_write_trylock(&EXT4_I(inode)->xattr_sem) == 0) + return 0; + *save = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); + return 1; +} + +static inline void ext4_write_unlock_xattr(struct inode *inode, int *save) +{ + if (*save == 0) + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); + up_write(&EXT4_I(inode)->xattr_sem); +} + +extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); + +extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); +extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); +extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); +extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len, + bool is_create, int *credits); +extern int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, + struct buffer_head *block_bh, size_t value_len, + bool is_create); + +extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, + struct ext4_xattr_inode_array **array, + int extra_credits); +extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array); + +extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle); +extern void ext4_evict_ea_inode(struct inode *inode); + +extern const struct xattr_handler * const ext4_xattr_handlers[]; + +extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is); +extern int ext4_xattr_ibody_get(struct inode *inode, int name_index, + const char *name, + void *buffer, size_t buffer_size); +extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is); + +extern struct mb_cache *ext4_xattr_create_cache(void); +extern void ext4_xattr_destroy_cache(struct mb_cache *); + +extern int +__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, + void *end, const char *function, unsigned int line); + +#define xattr_check_inode(inode, header, end) \ + __xattr_check_inode((inode), (header), (end), __func__, __LINE__) + +#ifdef CONFIG_EXT4_FS_SECURITY +extern int ext4_init_security(handle_t *handle, struct inode *inode, + struct inode *dir, const struct qstr *qstr); +#else +static inline int ext4_init_security(handle_t *handle, struct inode *inode, + struct inode *dir, const struct qstr *qstr) +{ + return 0; +} +#endif + +#ifdef CONFIG_LOCKDEP +extern void ext4_xattr_inode_set_class(struct inode *ea_inode); +#else +static inline void ext4_xattr_inode_set_class(struct inode *ea_inode) { } +#endif + +extern int ext4_get_inode_usage(struct inode *inode, qsize_t *usage); diff --git a/fs/ext4l/xattr_hurd.c b/fs/ext4l/xattr_hurd.c new file mode 100644 index 00000000000..8a5842e4cd9 --- /dev/null +++ b/fs/ext4l/xattr_hurd.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/xattr_hurd.c + * Handler for extended gnu attributes for the Hurd. 
+ * + * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> + * Copyright (C) 2020 by Jan (janneke) Nieuwenhuizen, <janneke@gnu.org> + */ + +#include <linux/init.h> +#include <linux/string.h> +#include "ext4.h" +#include "xattr.h" + +static bool +ext4_xattr_hurd_list(struct dentry *dentry) +{ + return test_opt(dentry->d_sb, XATTR_USER); +} + +static int +ext4_xattr_hurd_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + if (!test_opt(inode->i_sb, XATTR_USER)) + return -EOPNOTSUPP; + + return ext4_xattr_get(inode, EXT4_XATTR_INDEX_HURD, + name, buffer, size); +} + +static int +ext4_xattr_hurd_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + if (!test_opt(inode->i_sb, XATTR_USER)) + return -EOPNOTSUPP; + + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_HURD, + name, value, size, flags); +} + +const struct xattr_handler ext4_xattr_hurd_handler = { + .prefix = XATTR_HURD_PREFIX, + .list = ext4_xattr_hurd_list, + .get = ext4_xattr_hurd_get, + .set = ext4_xattr_hurd_set, +}; diff --git a/fs/ext4l/xattr_security.c b/fs/ext4l/xattr_security.c new file mode 100644 index 00000000000..776cf11d24c --- /dev/null +++ b/fs/ext4l/xattr_security.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/xattr_security.c + * Handler for storing security labels as extended attributes. + */ + +#include <linux/string.h> +#include <linux/fs.h> +#include <linux/security.h> +#include <linux/slab.h> +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" + +static int +ext4_xattr_security_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY, + name, buffer, size); +} + +static int +ext4_xattr_security_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, + name, value, size, flags); +} + +static int +ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + handle_t *handle = fs_info; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext4_xattr_set_handle(handle, inode, + EXT4_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, XATTR_CREATE); + if (err < 0) + break; + } + return err; +} + +int +ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &ext4_initxattrs, handle); +} + +const struct xattr_handler ext4_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = ext4_xattr_security_get, + .set = ext4_xattr_security_set, +}; diff --git a/fs/ext4l/xattr_trusted.c b/fs/ext4l/xattr_trusted.c new file mode 100644 index 00000000000..9811eb0ab27 --- /dev/null +++ b/fs/ext4l/xattr_trusted.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/xattr_trusted.c + * Handler for trusted extended attributes. 
+ * + * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org> + */ + +#include <linux/string.h> +#include <linux/capability.h> +#include <linux/fs.h> +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" + +static bool +ext4_xattr_trusted_list(struct dentry *dentry) +{ + return capable(CAP_SYS_ADMIN); +} + +static int +ext4_xattr_trusted_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED, + name, buffer, size); +} + +static int +ext4_xattr_trusted_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, + name, value, size, flags); +} + +const struct xattr_handler ext4_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = ext4_xattr_trusted_list, + .get = ext4_xattr_trusted_get, + .set = ext4_xattr_trusted_set, +}; diff --git a/fs/ext4l/xattr_user.c b/fs/ext4l/xattr_user.c new file mode 100644 index 00000000000..4b70bf4e762 --- /dev/null +++ b/fs/ext4l/xattr_user.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/xattr_user.c + * Handler for extended user attributes. + * + * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> + */ + +#include <linux/string.h> +#include <linux/fs.h> +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" + +static bool +ext4_xattr_user_list(struct dentry *dentry) +{ + return test_opt(dentry->d_sb, XATTR_USER); +} + +static int +ext4_xattr_user_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + if (!test_opt(inode->i_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER, + name, buffer, size); +} + +static int +ext4_xattr_user_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + if (!test_opt(inode->i_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, + name, value, size, flags); +} + +const struct xattr_handler ext4_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = ext4_xattr_user_list, + .get = ext4_xattr_user_get, + .set = ext4_xattr_user_set, +}; -- 2.43.0
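The four per-namespace handlers above are wired together through a single prefix-dispatch table. The following is a minimal sketch of that wiring, based on the upstream ext4 pattern rather than quoted from this patch; the array ordering and the mount-time assignment shown are assumptions:

const struct xattr_handler * const ext4_xattr_handlers[] = {
	&ext4_xattr_user_handler,	/* matches "user." names */
	&ext4_xattr_trusted_handler,	/* matches "trusted." names */
#ifdef CONFIG_EXT4_FS_SECURITY
	&ext4_xattr_security_handler,	/* matches "security." names */
#endif
	&ext4_xattr_hurd_handler,	/* matches "gnu." names */
	NULL				/* terminator for the VFS walk */
};

/*
 * At mount time the superblock is pointed at this table (upstream ext4
 * does this in super.c), after which the generic VFS xattr code strips
 * the prefix and invokes the matching ->get/->set callback above:
 *
 *	sb->s_xattr = ext4_xattr_handlers;
 */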
From: Simon Glass <simon.glass@canonical.com> Copy include/linux/jbd2.h from Linux v6.18. This header defines the JBD2 journaling layer interface used by ext4. Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: Simon Glass <simon.glass@canonical.com> --- include/linux/jbd2.h | 1815 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1815 insertions(+) create mode 100644 include/linux/jbd2.h diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h new file mode 100644 index 00000000000..43b9297fe8a --- /dev/null +++ b/include/linux/jbd2.h @@ -0,0 +1,1815 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * linux/include/linux/jbd2.h + * + * Written by Stephen C. Tweedie <sct@redhat.com> + * + * Copyright 1998-2000 Red Hat, Inc --- All Rights Reserved + * + * Definitions for transaction data structures for the buffer cache + * filesystem journaling support. + */ + +#ifndef _LINUX_JBD2_H +#define _LINUX_JBD2_H + +/* Allow this file to be included directly into e2fsprogs */ +#ifndef __KERNEL__ +#include "jfs_compat.h" +#define JBD2_DEBUG +#else + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/journal-head.h> +#include <linux/stddef.h> +#include <linux/mutex.h> +#include <linux/timer.h> +#include <linux/slab.h> +#include <linux/bit_spinlock.h> +#include <linux/blkdev.h> +#include <linux/crc32c.h> +#endif + +#define journal_oom_retry 1 + +/* + * Define JBD2_PARANOID_IOFAIL to cause a kernel BUG() if ext4 finds + * certain classes of error which can occur due to failed IOs. Under + * normal use we want ext4 to continue after such errors, because + * hardware _can_ fail, but for debugging purposes when running tests on + * known-good hardware we may want to trap these errors. + */ +#undef JBD2_PARANOID_IOFAIL + +/* + * The default maximum commit age, in seconds. + */ +#define JBD2_DEFAULT_MAX_COMMIT_AGE 5 + +#ifdef CONFIG_JBD2_DEBUG +/* + * Define JBD2_EXPENSIVE_CHECKING to enable more expensive internal + * consistency checks. By default we don't do this unless + * CONFIG_JBD2_DEBUG is on. + */ +#define JBD2_EXPENSIVE_CHECKING +void __jbd2_debug(int level, const char *file, const char *func, + unsigned int line, const char *fmt, ...); + +#define jbd2_debug(n, fmt, a...) \ + __jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a) +#else +#define jbd2_debug(n, fmt, a...) no_printk(fmt, ##a) +#endif + +extern void *jbd2_alloc(size_t size, gfp_t flags); +extern void jbd2_free(void *ptr, size_t size); + +#define JBD2_MIN_JOURNAL_BLOCKS 1024 +#define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256 + +#ifdef __KERNEL__ + +/** + * typedef handle_t - The handle_t type represents a single atomic update being performed by some process. + * + * All filesystem modifications made by the process go + * through this handle. Recursive operations (such as quota operations) + * are gathered into a single update. + * + * The buffer credits field is used to account for journaled buffers + * being modified by the running process. To ensure that there is + * enough log space for all outstanding operations, we need to limit the + * number of outstanding buffers possible at any time. When the + * operation completes, any buffer credits not used are credited back to + * the transaction, so that at all times we know how many buffers the + * outstanding updates on a transaction might possibly touch. + * + * This is an opaque datatype.
+ **/ +typedef struct jbd2_journal_handle handle_t; /* Atomic operation type */ + + +/** + * typedef journal_t - The journal_t maintains all of the journaling state information for a single filesystem. + * + * journal_t is linked to from the fs superblock structure. + * + * We use the journal_t to keep track of all outstanding transaction + * activity on the filesystem, and to manage the state of the log + * writing process. + * + * This is an opaque datatype. + **/ +typedef struct journal_s journal_t; /* Journal control structure */ +#endif + +/* + * Internal structures used by the logging mechanism: + */ + +#define JBD2_MAGIC_NUMBER 0xc03b3998U /* The first 4 bytes of /dev/random! */ + +/* + * On-disk structures + */ + +/* + * Descriptor block types: + */ + +#define JBD2_DESCRIPTOR_BLOCK 1 +#define JBD2_COMMIT_BLOCK 2 +#define JBD2_SUPERBLOCK_V1 3 +#define JBD2_SUPERBLOCK_V2 4 +#define JBD2_REVOKE_BLOCK 5 + +/* + * Standard header for all descriptor blocks: + */ +typedef struct journal_header_s +{ + __be32 h_magic; + __be32 h_blocktype; + __be32 h_sequence; +} journal_header_t; + +/* + * Checksum types. + */ +#define JBD2_CRC32_CHKSUM 1 +#define JBD2_MD5_CHKSUM 2 +#define JBD2_SHA1_CHKSUM 3 +#define JBD2_CRC32C_CHKSUM 4 + +#define JBD2_CRC32_CHKSUM_SIZE 4 + +#define JBD2_CHECKSUM_BYTES (32 / sizeof(u32)) +/* + * Commit block header for storing transactional checksums: + * + * NOTE: If FEATURE_COMPAT_CHECKSUM (checksum v1) is set, the h_chksum* + * fields are used to store a checksum of the descriptor and data blocks. + * + * If FEATURE_INCOMPAT_CSUM_V2 (checksum v2) is set, then the h_chksum + * field is used to store crc32c(uuid+commit_block). Each journal metadata + * block gets its own checksum, and data block checksums are stored in + * journal_block_tag (in the descriptor). The other h_chksum* fields are + * not used. + * + * If FEATURE_INCOMPAT_CSUM_V3 is set, the descriptor block uses + * journal_block_tag3_t to store a full 32-bit checksum. Everything else + * is the same as v2. + * + * Checksum v1, v2, and v3 are mutually exclusive features. + */ +struct commit_header { + __be32 h_magic; + __be32 h_blocktype; + __be32 h_sequence; + unsigned char h_chksum_type; + unsigned char h_chksum_size; + unsigned char h_padding[2]; + __be32 h_chksum[JBD2_CHECKSUM_BYTES]; + __be64 h_commit_sec; + __be32 h_commit_nsec; +}; + +/* + * The block tag: used to describe a single buffer in the journal. + * t_blocknr_high is only used if INCOMPAT_64BIT is set, so this + * raw struct shouldn't be used for pointer math or sizeof() - use + * journal_tag_bytes(journal) instead to compute this. + */ +typedef struct journal_block_tag3_s +{ + __be32 t_blocknr; /* The on-disk block number */ + __be32 t_flags; /* See below */ + __be32 t_blocknr_high; /* most-significant high 32bits. */ + __be32 t_checksum; /* crc32c(uuid+seq+block) */ +} journal_block_tag3_t; + +typedef struct journal_block_tag_s +{ + __be32 t_blocknr; /* The on-disk block number */ + __be16 t_checksum; /* truncated crc32c(uuid+seq+block) */ + __be16 t_flags; /* See below */ + __be32 t_blocknr_high; /* most-significant high 32bits. 
*/ +} journal_block_tag_t; + +/* Tail of descriptor or revoke block, for checksumming */ +struct jbd2_journal_block_tail { + __be32 t_checksum; /* crc32c(uuid+descr_block) */ +}; + +/* + * The revoke descriptor: used on disk to describe a series of blocks to + * be revoked from the log + */ +typedef struct jbd2_journal_revoke_header_s +{ + journal_header_t r_header; + __be32 r_count; /* Count of bytes used in the block */ +} jbd2_journal_revoke_header_t; + +/* Definitions for the journal tag flags word: */ +#define JBD2_FLAG_ESCAPE 1 /* on-disk block is escaped */ +#define JBD2_FLAG_SAME_UUID 2 /* block has same uuid as previous */ +#define JBD2_FLAG_DELETED 4 /* block deleted by this transaction */ +#define JBD2_FLAG_LAST_TAG 8 /* last tag in this descriptor block */ + + +/* + * The journal superblock. All fields are in big-endian byte order. + */ +typedef struct journal_superblock_s +{ +/* 0x0000 */ + journal_header_t s_header; + +/* 0x000C */ + /* Static information describing the journal */ + __be32 s_blocksize; /* journal device blocksize */ + __be32 s_maxlen; /* total blocks in journal file */ + __be32 s_first; /* first block of log information */ + +/* 0x0018 */ + /* Dynamic information describing the current state of the log */ + __be32 s_sequence; /* first commit ID expected in log */ + __be32 s_start; /* blocknr of start of log */ + +/* 0x0020 */ + /* Error value, as set by jbd2_journal_abort(). */ + __be32 s_errno; + +/* 0x0024 */ + /* Remaining fields are only valid in a version-2 superblock */ + __be32 s_feature_compat; /* compatible feature set */ + __be32 s_feature_incompat; /* incompatible feature set */ + __be32 s_feature_ro_compat; /* readonly-compatible feature set */ +/* 0x0030 */ + __u8 s_uuid[16]; /* 128-bit uuid for journal */ + +/* 0x0040 */ + __be32 s_nr_users; /* Nr of filesystems sharing log */ + + __be32 s_dynsuper; /* Blocknr of dynamic superblock copy*/ + +/* 0x0048 */ + __be32 s_max_transaction; /* Limit of journal blocks per trans.*/ + __be32 s_max_trans_data; /* Limit of data blocks per trans. 
*/ + +/* 0x0050 */ + __u8 s_checksum_type; /* checksum type */ + __u8 s_padding2[3]; +/* 0x0054 */ + __be32 s_num_fc_blks; /* Number of fast commit blocks */ + __be32 s_head; /* blocknr of head of log, only uptodate + * while the filesystem is clean */ +/* 0x005C */ + __u32 s_padding[40]; + __be32 s_checksum; /* crc32c(superblock) */ + +/* 0x0100 */ + __u8 s_users[16*48]; /* ids of all fs'es sharing the log */ +/* 0x0400 */ +} journal_superblock_t; + +#define JBD2_FEATURE_COMPAT_CHECKSUM 0x00000001 + +#define JBD2_FEATURE_INCOMPAT_REVOKE 0x00000001 +#define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002 +#define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004 +#define JBD2_FEATURE_INCOMPAT_CSUM_V2 0x00000008 +#define JBD2_FEATURE_INCOMPAT_CSUM_V3 0x00000010 +#define JBD2_FEATURE_INCOMPAT_FAST_COMMIT 0x00000020 + +/* See "journal feature predicate functions" below */ + +/* Features known to this kernel version: */ +#define JBD2_KNOWN_COMPAT_FEATURES JBD2_FEATURE_COMPAT_CHECKSUM +#define JBD2_KNOWN_ROCOMPAT_FEATURES 0 +#define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \ + JBD2_FEATURE_INCOMPAT_64BIT | \ + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \ + JBD2_FEATURE_INCOMPAT_CSUM_V2 | \ + JBD2_FEATURE_INCOMPAT_CSUM_V3 | \ + JBD2_FEATURE_INCOMPAT_FAST_COMMIT) + +#ifdef __KERNEL__ + +#include <linux/fs.h> +#include <linux/sched.h> + +enum jbd_state_bits { + BH_JBD /* Has an attached ext3 journal_head */ + = BH_PrivateStart, + BH_JWrite, /* Being written to log (@@@ DEBUGGING) */ + BH_Freed, /* Has been freed (truncated) */ + BH_Revoked, /* Has been revoked from the log */ + BH_RevokeValid, /* Revoked flag is valid */ + BH_JBDDirty, /* Is dirty but journaled */ + BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ + BH_Shadow, /* IO on shadow buffer is running */ + BH_Verified, /* Metadata block has been verified ok */ + BH_JBDPrivateStart, /* First bit available for private use by FS */ +}; + +BUFFER_FNS(JBD, jbd) +BUFFER_FNS(JWrite, jwrite) +BUFFER_FNS(JBDDirty, jbddirty) +TAS_BUFFER_FNS(JBDDirty, jbddirty) +BUFFER_FNS(Revoked, revoked) +TAS_BUFFER_FNS(Revoked, revoked) +BUFFER_FNS(RevokeValid, revokevalid) +TAS_BUFFER_FNS(RevokeValid, revokevalid) +BUFFER_FNS(Freed, freed) +BUFFER_FNS(Shadow, shadow) +BUFFER_FNS(Verified, verified) + +static inline struct buffer_head *jh2bh(struct journal_head *jh) +{ + return jh->b_bh; +} + +static inline struct journal_head *bh2jh(struct buffer_head *bh) +{ + return bh->b_private; +} + +static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) +{ + bit_spin_lock(BH_JournalHead, &bh->b_state); +} + +static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) +{ + bit_spin_unlock(BH_JournalHead, &bh->b_state); +} + +#define J_ASSERT(assert) BUG_ON(!(assert)) + +#define J_ASSERT_BH(bh, expr) J_ASSERT(expr) +#define J_ASSERT_JH(jh, expr) J_ASSERT(expr) + +#if defined(JBD2_PARANOID_IOFAIL) +#define J_EXPECT(expr, why...) J_ASSERT(expr) +#define J_EXPECT_BH(bh, expr, why...) J_ASSERT_BH(bh, expr) +#define J_EXPECT_JH(jh, expr, why...) J_ASSERT_JH(jh, expr) +#else +#define __journal_expect(expr, why...) \ + ({ \ + int val = (expr); \ + if (!val) { \ + printk(KERN_ERR \ + "JBD2 unexpected failure: %s: %s;\n", \ + __func__, #expr); \ + printk(KERN_ERR why "\n"); \ + } \ + val; \ + }) +#define J_EXPECT(expr, why...) __journal_expect(expr, ## why) +#define J_EXPECT_BH(bh, expr, why...) __journal_expect(expr, ## why) +#define J_EXPECT_JH(jh, expr, why...) 
__journal_expect(expr, ## why) +#endif + +/* Flags in jbd_inode->i_flags */ +#define __JI_COMMIT_RUNNING 0 +#define __JI_WRITE_DATA 1 +#define __JI_WAIT_DATA 2 + +/* + * Commit of the inode data in progress. We use this flag to protect us from + * concurrent deletion of inode. We cannot use reference to inode for this + * since we cannot afford doing last iput() on behalf of kjournald + */ +#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING) +/* Write allocated dirty buffers in this inode before commit */ +#define JI_WRITE_DATA (1 << __JI_WRITE_DATA) +/* Wait for outstanding data writes for this inode before commit */ +#define JI_WAIT_DATA (1 << __JI_WAIT_DATA) + +/** + * struct jbd2_inode - The jbd_inode type is the structure linking inodes in + * ordered mode present in a transaction so that we can sync them during commit. + */ +struct jbd2_inode { + /** + * @i_transaction: + * + * Which transaction does this inode belong to? Either the running + * transaction or the committing one. [j_list_lock] + */ + transaction_t *i_transaction; + + /** + * @i_next_transaction: + * + * Pointer to the running transaction modifying inode's data in case + * there is already a committing transaction touching it. [j_list_lock] + */ + transaction_t *i_next_transaction; + + /** + * @i_list: List of inodes in the i_transaction [j_list_lock] + */ + struct list_head i_list; + + /** + * @i_vfs_inode: + * + * VFS inode this inode belongs to [constant for lifetime of structure] + */ + struct inode *i_vfs_inode; + + /** + * @i_flags: Flags of inode [j_list_lock] + */ + unsigned long i_flags; + + /** + * @i_dirty_start: + * + * Offset in bytes where the dirty range for this inode starts. + * [j_list_lock] + */ + loff_t i_dirty_start; + + /** + * @i_dirty_end: + * + * Inclusive offset in bytes where the dirty range for this inode + * ends. [j_list_lock] + */ + loff_t i_dirty_end; +}; + +struct jbd2_revoke_table_s; + +/** + * struct jbd2_journal_handle - The jbd2_journal_handle type is the concrete + * type associated with handle_t. + * @h_transaction: Which compound transaction is this update a part of? + * @h_journal: Which journal handle belongs to - used iff h_reserved set. + * @h_rsv_handle: Handle reserved for finishing the logical operation. + * @h_total_credits: Number of remaining buffers we are allowed to add to + * journal. These are dirty buffers and revoke descriptor blocks. + * @h_revoke_credits: Number of remaining revoke records available for handle + * @h_ref: Reference count on this handle. + * @h_err: Field for caller's use to track errors through large fs operations. + * @h_sync: Flag for sync-on-close. + * @h_reserved: Flag for handle for reserved credits. + * @h_aborted: Flag indicating fatal error on handle. + * @h_type: For handle statistics. + * @h_line_no: For handle statistics. + * @h_start_jiffies: Handle Start time. + * @h_requested_credits: Holds @h_total_credits after handle is started. + * @h_revoke_credits_requested: Holds @h_revoke_credits after handle is started. + * @saved_alloc_context: Saved context while transaction is open. + **/ + +/* Docbook can't yet cope with the bit fields, but will leave the documentation + * in so it can be fixed later. 
+ */ + +struct jbd2_journal_handle +{ + union { + transaction_t *h_transaction; + /* Which journal handle belongs to - used iff h_reserved set */ + journal_t *h_journal; + }; + + handle_t *h_rsv_handle; + int h_total_credits; + int h_revoke_credits; + int h_revoke_credits_requested; + int h_ref; + int h_err; + + /* Flags [no locking] */ + unsigned int h_sync: 1; + unsigned int h_reserved: 1; + unsigned int h_aborted: 1; + unsigned int h_type: 8; + unsigned int h_line_no: 16; + + unsigned long h_start_jiffies; + unsigned int h_requested_credits; + + unsigned int saved_alloc_context; +}; + + +/* + * Some stats for checkpoint phase + */ +struct transaction_chp_stats_s { + unsigned long cs_chp_time; + __u32 cs_forced_to_close; + __u32 cs_written; + __u32 cs_dropped; +}; + +/* The transaction_t type is the guts of the journaling mechanism. It + * tracks a compound transaction through its various states: + * + * RUNNING: accepting new updates + * LOCKED: Updates still running but we don't accept new ones + * RUNDOWN: Updates are tidying up but have finished requesting + * new buffers to modify (state not used for now) + * FLUSH: All updates complete, but we are still writing to disk + * COMMIT: All data on disk, writing commit record + * FINISHED: We still have to keep the transaction for checkpointing. + * + * The transaction keeps track of all of the buffers modified by a + * running transaction, and all of the buffers committed but not yet + * flushed to home for finished transactions. + * (Locking Documentation improved by LockDoc) + */ + +/* + * Lock ranking: + * + * j_list_lock + * ->jbd_lock_bh_journal_head() (This is "innermost") + * + * j_state_lock + * ->b_state_lock + * + * b_state_lock + * ->j_list_lock + * + * j_state_lock + * ->j_list_lock (journal_unmap_buffer) + * + */ + +struct transaction_s +{ + /* Pointer to the journal for this transaction. [no locking] */ + journal_t *t_journal; + + /* Sequence number for this transaction [no locking] */ + tid_t t_tid; + + /* + * Transaction's current state + * [no locking - only kjournald2 alters this] + * [j_list_lock] guards transition of a transaction into T_FINISHED + * state and subsequent call of __jbd2_journal_drop_transaction() + * FIXME: needs barriers + * KLUDGE: [use j_state_lock] + */ + enum { + T_RUNNING, + T_LOCKED, + T_SWITCH, + T_FLUSH, + T_COMMIT, + T_COMMIT_DFLUSH, + T_COMMIT_JFLUSH, + T_COMMIT_CALLBACK, + T_FINISHED + } t_state; + + /* + * Where in the log does this transaction's commit start? [no locking] + */ + unsigned long t_log_start; + + /* + * Number of buffers on the t_buffers list [j_list_lock, no locks + * needed for jbd2 thread] + */ + int t_nr_buffers; + + /* + * Doubly-linked circular list of all buffers reserved but not yet + * modified by this transaction [j_list_lock, no locks needed for + * jbd2 thread] + */ + struct journal_head *t_reserved_list; + + /* + * Doubly-linked circular list of all metadata buffers owned by this + * transaction [j_list_lock, no locks needed for jbd2 thread] + */ + struct journal_head *t_buffers; + + /* + * Doubly-linked circular list of all forget buffers (superseded + * buffers which we can un-checkpoint once this transaction commits) + * [j_list_lock] + */ + struct journal_head *t_forget; + + /* + * Doubly-linked circular list of all buffers still to be flushed before + * this transaction can be checkpointed. [j_list_lock] + */ + struct journal_head *t_checkpoint_list; + + /* + * Doubly-linked circular list of metadata buffers being + * shadowed by log IO.
The IO buffers on the iobuf list and + * the shadow buffers on this list match each other one for + * one at all times. [j_list_lock, no locks needed for jbd2 + * thread] + */ + struct journal_head *t_shadow_list; + + /* + * List of inodes associated with the transaction; e.g., ext4 uses + * this to track inodes in data=ordered and data=journal mode that + * need special handling on transaction commit; also used by ocfs2. + * [j_list_lock] + */ + struct list_head t_inode_list; + + /* + * Longest time some handle had to wait for running transaction + */ + unsigned long t_max_wait; + + /* + * When transaction started + */ + unsigned long t_start; + + /* + * When commit was requested [j_state_lock] + */ + unsigned long t_requested; + + /* + * Checkpointing stats [j_list_lock] + */ + struct transaction_chp_stats_s t_chp_stats; + + /* + * Number of outstanding updates running on this transaction + * [none] + */ + atomic_t t_updates; + + /* + * Number of blocks reserved for this transaction in the journal. + * This is including all credits reserved when starting transaction + * handles as well as all journal descriptor blocks needed for this + * transaction. [none] + */ + atomic_t t_outstanding_credits; + + /* + * Number of revoke records for this transaction added by already + * stopped handles. [none] + */ + atomic_t t_outstanding_revokes; + + /* + * How many handles used this transaction? [none] + */ + atomic_t t_handle_count; + + /* + * Forward and backward links for the circular list of all transactions + * awaiting checkpoint. [j_list_lock] + */ + transaction_t *t_cpnext, *t_cpprev; + + /* + * When will the transaction expire (become due for commit), in jiffies? + * [no locking] + */ + unsigned long t_expires; + + /* + * When this transaction started, in nanoseconds [no locking] + */ + ktime_t t_start_time; + + /* + * This transaction is being forced and some process is + * waiting for it to finish. + */ + unsigned int t_synchronous_commit:1; + + /* Disk flush needs to be sent to fs partition [no locking] */ + int t_need_data_flush; +}; + +struct transaction_run_stats_s { + unsigned long rs_wait; + unsigned long rs_request_delay; + unsigned long rs_running; + unsigned long rs_locked; + unsigned long rs_flushing; + unsigned long rs_logging; + + __u32 rs_handle_count; + __u32 rs_blocks; + __u32 rs_blocks_logged; +}; + +struct transaction_stats_s { + unsigned long ts_tid; + unsigned long ts_requested; + struct transaction_run_stats_s run; +}; + +static inline unsigned long +jbd2_time_diff(unsigned long start, unsigned long end) +{ + if (end >= start) + return end - start; + + return end + (MAX_JIFFY_OFFSET - start); +} + +#define JBD2_NR_BATCH 64 + +enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; + +#define JBD2_FC_REPLAY_STOP 0 +#define JBD2_FC_REPLAY_CONTINUE 1 + +/** + * struct journal_s - The journal_s type is the concrete type associated with + * journal_t. + */ +struct journal_s +{ + /** + * @j_flags: General journaling state flags [j_state_lock, + * no lock for quick racy checks] + */ + unsigned long j_flags; + + /** + * @j_errno: + * + * Is there an outstanding uncleared error on the journal (from a prior + * abort)? [j_state_lock] + */ + int j_errno; + + /** + * @j_abort_mutex: Lock the whole aborting procedure. + */ + struct mutex j_abort_mutex; + + /** + * @j_sb_buffer: The first part of the superblock buffer. + */ + struct buffer_head *j_sb_buffer; + + /** + * @j_superblock: The second part of the superblock buffer. 
+ */ + journal_superblock_t *j_superblock; + + /** + * @j_state_lock: Protect the various scalars in the journal. + */ + rwlock_t j_state_lock; + + /** + * @j_barrier_count: + * + * Number of processes waiting to create a barrier lock [j_state_lock, + * no lock for quick racy checks] + */ + int j_barrier_count; + + /** + * @j_barrier: The barrier lock itself. + */ + struct mutex j_barrier; + + /** + * @j_running_transaction: + * + * Transactions: The current running transaction... + * [j_state_lock, no lock for quick racy checks] [caller holding + * open handle] + */ + transaction_t *j_running_transaction; + + /** + * @j_committing_transaction: + * + * the transaction we are pushing to disk + * [j_state_lock] [caller holding open handle] + */ + transaction_t *j_committing_transaction; + + /** + * @j_checkpoint_transactions: + * + * ... and a linked circular list of all transactions waiting for + * checkpointing. [j_list_lock] + */ + transaction_t *j_checkpoint_transactions; + + /** + * @j_wait_transaction_locked: + * + * Wait queue for waiting for a locked transaction to start committing, + * or for a barrier lock to be released. + */ + wait_queue_head_t j_wait_transaction_locked; + + /** + * @j_wait_done_commit: Wait queue for waiting for commit to complete. + */ + wait_queue_head_t j_wait_done_commit; + + /** + * @j_wait_commit: Wait queue to trigger commit. + */ + wait_queue_head_t j_wait_commit; + + /** + * @j_wait_updates: Wait queue to wait for updates to complete. + */ + wait_queue_head_t j_wait_updates; + + /** + * @j_wait_reserved: + * + * Wait queue to wait for reserved buffer credits to drop. + */ + wait_queue_head_t j_wait_reserved; + + /** + * @j_fc_wait: + * + * Wait queue to wait for completion of async fast commits. + */ + wait_queue_head_t j_fc_wait; + + /** + * @j_checkpoint_mutex: + * + * Semaphore for locking against concurrent checkpoints. + */ + struct mutex j_checkpoint_mutex; + + /** + * @j_chkpt_bhs: + * + * List of buffer heads used by the checkpoint routine. This + * was moved from jbd2_log_do_checkpoint() to reduce stack + * usage. Access to this array is controlled by the + * @j_checkpoint_mutex. [j_checkpoint_mutex] + */ + struct buffer_head *j_chkpt_bhs[JBD2_NR_BATCH]; + + /** + * @j_shrinker: + * + * Journal head shrinker, reclaim buffer's journal head which + * has been written back. + */ + struct shrinker *j_shrinker; + + /** + * @j_checkpoint_jh_count: + * + * Number of journal buffers on the checkpoint list. [j_list_lock] + */ + struct percpu_counter j_checkpoint_jh_count; + + /** + * @j_shrink_transaction: + * + * Record next transaction will shrink on the checkpoint list. + * [j_list_lock] + */ + transaction_t *j_shrink_transaction; + + /** + * @j_head: + * + * Journal head: identifies the first unused block in the journal. + * [j_state_lock] + */ + unsigned long j_head; + + /** + * @j_tail: + * + * Journal tail: identifies the oldest still-used block in the journal. + * [j_state_lock] + */ + unsigned long j_tail; + + /** + * @j_free: + * + * Journal free: how many free blocks are there in the journal? + * [j_state_lock] + */ + unsigned long j_free; + + /** + * @j_first: + * + * The block number of the first usable block in the journal + * [j_state_lock]. + */ + unsigned long j_first; + + /** + * @j_last: + * + * The block number one beyond the last usable block in the journal + * [j_state_lock]. + */ + unsigned long j_last; + + /** + * @j_fc_first: + * + * The block number of the first fast commit block in the journal + * [j_state_lock]. 
+ */ + unsigned long j_fc_first; + + /** + * @j_fc_off: + * + * Number of fast commit blocks currently allocated. Accessed only + * during fast commit. Currently only one process can do fast commit, so + * this field is not protected by any lock. + */ + unsigned long j_fc_off; + + /** + * @j_fc_last: + * + * The block number one beyond the last fast commit block in the journal + * [j_state_lock]. + */ + unsigned long j_fc_last; + + /** + * @j_dev: Device where we store the journal. + */ + struct block_device *j_dev; + + /** + * @j_blocksize: Block size for the location where we store the journal. + */ + int j_blocksize; + + /** + * @j_blk_offset: + * + * Starting block offset into the device where we store the journal. + */ + unsigned long long j_blk_offset; + + /** + * @j_devname: Journal device name. + */ + char j_devname[BDEVNAME_SIZE+24]; + + /** + * @j_fs_dev: + * + * Device which holds the client fs. For internal journal this will be + * equal to j_dev. + */ + struct block_device *j_fs_dev; + + /** + * @j_fs_dev_wb_err: + * + * Records the errseq of the client fs's backing block device. + */ + errseq_t j_fs_dev_wb_err; + + /** + * @j_total_len: Total maximum capacity of the journal region on disk. + */ + unsigned int j_total_len; + + /** + * @j_reserved_credits: + * + * Number of buffers reserved from the running transaction. + */ + atomic_t j_reserved_credits; + + /** + * @j_list_lock: Protects the buffer lists and internal buffer state. + */ + spinlock_t j_list_lock; + + /** + * @j_inode: + * + * Optional inode where we store the journal. If present, all + * journal block numbers are mapped into this inode via bmap(). + */ + struct inode *j_inode; + + /** + * @j_tail_sequence: + * + * Sequence number of the oldest transaction in the log [j_state_lock] + */ + tid_t j_tail_sequence; + + /** + * @j_transaction_sequence: + * + * Sequence number of the next transaction to grant [j_state_lock] + */ + tid_t j_transaction_sequence; + + /** + * @j_commit_sequence: + * + * Sequence number of the most recently committed transaction + * [j_state_lock, no lock for quick racy checks] + */ + tid_t j_commit_sequence; + + /** + * @j_commit_request: + * + * Sequence number of the most recent transaction wanting commit + * [j_state_lock, no lock for quick racy checks] + */ + tid_t j_commit_request; + + /** + * @j_uuid: + * + * Journal uuid: identifies the object (filesystem, LVM volume etc) + * backed by this journal. This will eventually be replaced by an array + * of uuids, allowing us to index multiple devices within a single + * journal and to perform atomic updates across them. + */ + __u8 j_uuid[16]; + + /** + * @j_task: Pointer to the current commit thread for this journal. + */ + struct task_struct *j_task; + + /** + * @j_max_transaction_buffers: + * + * Maximum number of metadata buffers to allow in a single compound + * commit transaction. + */ + int j_max_transaction_buffers; + + /** + * @j_revoke_records_per_block: + * + * Number of revoke records that fit in one descriptor block. + */ + int j_revoke_records_per_block; + + /** + * @j_transaction_overhead_buffers: + * + * Number of blocks each transaction needs for its own bookkeeping + */ + int j_transaction_overhead_buffers; + + /** + * @j_commit_interval: + * + * What is the maximum transaction lifetime before we begin a commit? + */ + unsigned long j_commit_interval; + + /** + * @j_commit_timer: The timer used to wakeup the commit thread. + */ + struct timer_list j_commit_timer; + + /** + * @j_revoke_lock: Protect the revoke table.
+ */ + spinlock_t j_revoke_lock; + + /** + * @j_revoke: + * + * The revoke table - maintains the list of revoked blocks in the + * current transaction. + */ + struct jbd2_revoke_table_s *j_revoke; + + /** + * @j_revoke_table: Alternate revoke tables for j_revoke. + */ + struct jbd2_revoke_table_s *j_revoke_table[2]; + + /** + * @j_wbuf: Array of bhs for jbd2_journal_commit_transaction. + */ + struct buffer_head **j_wbuf; + + /** + * @j_fc_wbuf: Array of fast commit bhs for fast commit. Accessed only + * during a fast commit. Currently only one process can do fast commit, so + * this field is not protected by any lock. + */ + struct buffer_head **j_fc_wbuf; + + /** + * @j_wbufsize: + * + * Size of @j_wbuf array. + */ + int j_wbufsize; + + /** + * @j_fc_wbufsize: + * + * Size of @j_fc_wbuf array. + */ + int j_fc_wbufsize; + + /** + * @j_last_sync_writer: + * + * The pid of the last person to run a synchronous operation + * through the journal. + */ + pid_t j_last_sync_writer; + + /** + * @j_average_commit_time: + * + * The average amount of time in nanoseconds it takes to commit a + * transaction to disk. [j_state_lock] + */ + u64 j_average_commit_time; + + /** + * @j_min_batch_time: + * + * Minimum time that we should wait for additional filesystem operations + * to get batched into a synchronous handle in microseconds. + */ + u32 j_min_batch_time; + + /** + * @j_max_batch_time: + * + * Maximum time that we should wait for additional filesystem operations + * to get batched into a synchronous handle in microseconds. + */ + u32 j_max_batch_time; + + /** + * @j_commit_callback: + * + * This function is called when a transaction is closed. + */ + void (*j_commit_callback)(journal_t *, + transaction_t *); + + /** + * @j_submit_inode_data_buffers: + * + * This function is called for all inodes associated with the + * committing transaction marked with JI_WRITE_DATA flag + * before we start to write out the transaction to the journal. + */ + int (*j_submit_inode_data_buffers) + (struct jbd2_inode *); + + /** + * @j_finish_inode_data_buffers: + * + * This function is called for all inodes associated with the + * committing transaction marked with JI_WAIT_DATA flag + * after we have written the transaction to the journal + * but before we write out the commit block. + */ + int (*j_finish_inode_data_buffers) + (struct jbd2_inode *); + + /* + * Journal statistics + */ + + /** + * @j_history_lock: Protect the transactions statistics history. + */ + spinlock_t j_history_lock; + + /** + * @j_proc_entry: procfs entry for the jbd statistics directory. + */ + struct proc_dir_entry *j_proc_entry; + + /** + * @j_stats: Overall statistics. + */ + struct transaction_stats_s j_stats; + + /** + * @j_failed_commit: Failed journal commit ID. + */ + unsigned int j_failed_commit; + + /** + * @j_private: + * + * An opaque pointer to fs-private information. ext3 puts its + * superblock pointer here. + */ + void *j_private; + + /** + * @j_csum_seed: + * + * Precomputed journal UUID checksum for seeding other checksums. + */ + __u32 j_csum_seed; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /** + * @j_trans_commit_map: + * + * Lockdep entity to track transaction commit dependencies. Handles + * hold this "lock" for read, when we wait for commit, we acquire the + * "lock" for writing. This matches the properties of jbd2 journalling + * where the running transaction has to wait for all handles to be + * dropped to commit that transaction and also acquiring a handle may + * require transaction commit to finish.
+ */ + struct lockdep_map j_trans_commit_map; +#endif + + /** + * @j_fc_cleanup_callback: + * + * Clean-up after fast commit or full commit. JBD2 calls this function + * after every commit operation. + */ + void (*j_fc_cleanup_callback)(struct journal_s *journal, int full, tid_t tid); + + /** + * @j_fc_replay_callback: + * + * File-system specific function that performs replay of a fast + * commit. JBD2 calls this function for each fast commit block found in + * the journal. This function should return JBD2_FC_REPLAY_CONTINUE + * to indicate that the block was processed correctly and more fast + * commit replay should continue. Return value of JBD2_FC_REPLAY_STOP + * indicates the end of replay (no more blocks remaining). A negative + * return value indicates error. + */ + int (*j_fc_replay_callback)(struct journal_s *journal, + struct buffer_head *bh, + enum passtype pass, int off, + tid_t expected_commit_id); + + /** + * @j_bmap: + * + * Bmap function that should be used instead of the generic + * VFS bmap function. + */ + int (*j_bmap)(struct journal_s *journal, sector_t *block); +}; + +#define jbd2_might_wait_for_commit(j) \ + do { \ + rwsem_acquire(&j->j_trans_commit_map, 0, 0, _THIS_IP_); \ + rwsem_release(&j->j_trans_commit_map, _THIS_IP_); \ + } while (0) + +/* + * We can support any known requested features iff the + * superblock is not in version 1. Otherwise we fail to support any + * extended sb features. + */ +static inline bool jbd2_format_support_feature(journal_t *j) +{ + return j->j_superblock->s_header.h_blocktype != + cpu_to_be32(JBD2_SUPERBLOCK_V1); +} + +/* journal feature predicate functions */ +#define JBD2_FEATURE_COMPAT_FUNCS(name, flagname) \ +static inline bool jbd2_has_feature_##name(journal_t *j) \ +{ \ + return (jbd2_format_support_feature(j) && \ + ((j)->j_superblock->s_feature_compat & \ + cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname)) != 0); \ +} \ +static inline void jbd2_set_feature_##name(journal_t *j) \ +{ \ + (j)->j_superblock->s_feature_compat |= \ + cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname); \ +} \ +static inline void jbd2_clear_feature_##name(journal_t *j) \ +{ \ + (j)->j_superblock->s_feature_compat &= \ + ~cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname); \ +} + +#define JBD2_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ +static inline bool jbd2_has_feature_##name(journal_t *j) \ +{ \ + return (jbd2_format_support_feature(j) && \ + ((j)->j_superblock->s_feature_ro_compat & \ + cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname)) != 0); \ +} \ +static inline void jbd2_set_feature_##name(journal_t *j) \ +{ \ + (j)->j_superblock->s_feature_ro_compat |= \ + cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname); \ +} \ +static inline void jbd2_clear_feature_##name(journal_t *j) \ +{ \ + (j)->j_superblock->s_feature_ro_compat &= \ + ~cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname); \ +} + +#define JBD2_FEATURE_INCOMPAT_FUNCS(name, flagname) \ +static inline bool jbd2_has_feature_##name(journal_t *j) \ +{ \ + return (jbd2_format_support_feature(j) && \ + ((j)->j_superblock->s_feature_incompat & \ + cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname)) != 0); \ +} \ +static inline void jbd2_set_feature_##name(journal_t *j) \ +{ \ + (j)->j_superblock->s_feature_incompat |= \ + cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname); \ +} \ +static inline void jbd2_clear_feature_##name(journal_t *j) \ +{ \ + (j)->j_superblock->s_feature_incompat &= \ + ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname); \ +} + +JBD2_FEATURE_COMPAT_FUNCS(checksum, CHECKSUM) + +JBD2_FEATURE_INCOMPAT_FUNCS(revoke, 
REVOKE) +JBD2_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) +JBD2_FEATURE_INCOMPAT_FUNCS(async_commit, ASYNC_COMMIT) +JBD2_FEATURE_INCOMPAT_FUNCS(csum2, CSUM_V2) +JBD2_FEATURE_INCOMPAT_FUNCS(csum3, CSUM_V3) +JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) + +/* Journal high priority write IO operation flags */ +#define JBD2_JOURNAL_REQ_FLAGS (REQ_META | REQ_SYNC | REQ_IDLE) + +/* + * Journal flag definitions + */ +#define JBD2_UNMOUNT 0x001 /* Journal thread is being destroyed */ +#define JBD2_ABORT 0x002 /* Journaling has been aborted for errors. */ +#define JBD2_ACK_ERR 0x004 /* The errno in the sb has been acked */ +#define JBD2_FLUSHED 0x008 /* The journal superblock has been flushed */ +#define JBD2_LOADED 0x010 /* The journal superblock has been loaded */ +#define JBD2_BARRIER 0x020 /* Use IDE barriers */ +#define JBD2_CYCLE_RECORD 0x080 /* Journal cycled record log on + * clean and empty filesystem + * logging area */ +#define JBD2_FAST_COMMIT_ONGOING 0x100 /* Fast commit is ongoing */ +#define JBD2_FULL_COMMIT_ONGOING 0x200 /* Full commit is ongoing */ +#define JBD2_JOURNAL_FLUSH_DISCARD 0x0001 +#define JBD2_JOURNAL_FLUSH_ZEROOUT 0x0002 +#define JBD2_JOURNAL_FLUSH_VALID (JBD2_JOURNAL_FLUSH_DISCARD | \ + JBD2_JOURNAL_FLUSH_ZEROOUT) + +/* + * Function declarations for the journaling transaction and buffer + * management + */ + +/* Filing buffers */ +extern bool __jbd2_journal_refile_buffer(struct journal_head *); +extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *); +extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); +extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); +static inline void jbd2_file_log_bh(struct list_head *head, struct buffer_head *bh) +{ + list_add_tail(&bh->b_assoc_buffers, head); +} +static inline void jbd2_unfile_log_bh(struct buffer_head *bh) +{ + list_del_init(&bh->b_assoc_buffers); +} + +/* Log buffer allocation */ +struct buffer_head *jbd2_journal_get_descriptor_buffer(transaction_t *, int); +void jbd2_descriptor_block_csum_set(journal_t *, struct buffer_head *); +int jbd2_journal_next_log_block(journal_t *, unsigned long long *); +int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, + unsigned long *block); +int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); +void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); + +/* Commit management */ +extern void jbd2_journal_commit_transaction(journal_t *); + +/* Checkpoint list management */ +enum jbd2_shrink_type {JBD2_SHRINK_DESTROY, JBD2_SHRINK_BUSY_STOP, JBD2_SHRINK_BUSY_SKIP}; + +void __jbd2_journal_clean_checkpoint_list(journal_t *journal, enum jbd2_shrink_type type); +unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan); +int __jbd2_journal_remove_checkpoint(struct journal_head *); +int jbd2_journal_try_remove_checkpoint(struct journal_head *jh); +void jbd2_journal_destroy_checkpoint(journal_t *journal); +void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); + + +/* + * Triggers + */ + +struct jbd2_buffer_trigger_type { + /* + * Fired at the moment data to write to the journal are known to be + * stable - so either at the moment b_frozen_data is created or just + * before a buffer is written to the journal. mapped_data is a mapped + * buffer that is the frozen data for commit.
+ */ + void (*t_frozen)(struct jbd2_buffer_trigger_type *type, + struct buffer_head *bh, void *mapped_data, + size_t size); + + /* + * Fired during journal abort for dirty buffers that will not be + * committed. + */ + void (*t_abort)(struct jbd2_buffer_trigger_type *type, + struct buffer_head *bh); +}; + +extern void jbd2_buffer_frozen_trigger(struct journal_head *jh, + void *mapped_data, + struct jbd2_buffer_trigger_type *triggers); +extern void jbd2_buffer_abort_trigger(struct journal_head *jh, + struct jbd2_buffer_trigger_type *triggers); + +/* Buffer IO */ +extern int jbd2_journal_write_metadata_buffer(transaction_t *transaction, + struct journal_head *jh_in, + struct buffer_head **bh_out, + sector_t blocknr); + +/* Transaction cache support */ +extern void jbd2_journal_destroy_transaction_cache(void); +extern int __init jbd2_journal_init_transaction_cache(void); +extern void jbd2_journal_free_transaction(transaction_t *); + +/* + * Journal locking. + * + * We need to lock the journal during transaction state changes so that nobody + * ever tries to take a handle on the running transaction while we are in the + * middle of moving it to the commit phase. j_state_lock does this. + * + * Note that the locking is completely interrupt unsafe. We never touch + * journal structures from interrupts. + */ + +static inline handle_t *journal_current_handle(void) +{ + return current->journal_info; +} + +/* The journaling code user interface: + * + * Create and destroy handles + * Register buffer modifications against the current transaction. + */ + +extern handle_t *jbd2_journal_start(journal_t *, int nblocks); +extern handle_t *jbd2__journal_start(journal_t *, int blocks, int rsv_blocks, + int revoke_records, gfp_t gfp_mask, + unsigned int type, unsigned int line_no); +extern int jbd2_journal_restart(handle_t *, int nblocks); +extern int jbd2__journal_restart(handle_t *, int nblocks, + int revoke_records, gfp_t gfp_mask); +extern int jbd2_journal_start_reserved(handle_t *handle, + unsigned int type, unsigned int line_no); +extern void jbd2_journal_free_reserved(handle_t *handle); +extern int jbd2_journal_extend(handle_t *handle, int nblocks, + int revoke_records); +extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); +extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); +extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *); +void jbd2_journal_set_triggers(struct buffer_head *, + struct jbd2_buffer_trigger_type *type); +extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); +extern int jbd2_journal_forget (handle_t *, struct buffer_head *); +int jbd2_journal_invalidate_folio(journal_t *, struct folio *, + size_t offset, size_t length); +bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio); +extern int jbd2_journal_stop(handle_t *); +extern int jbd2_journal_flush(journal_t *journal, unsigned int flags); +extern void jbd2_journal_lock_updates (journal_t *); +extern void jbd2_journal_unlock_updates (journal_t *); + +void jbd2_journal_wait_updates(journal_t *); + +extern journal_t * jbd2_journal_init_dev(struct block_device *bdev, + struct block_device *fs_dev, + unsigned long long start, int len, int bsize); +extern journal_t * jbd2_journal_init_inode (struct inode *); +extern int jbd2_journal_update_format (journal_t *); +extern int jbd2_journal_check_used_features + (journal_t *, unsigned long, unsigned long, unsigned long); +extern int jbd2_journal_check_available_features + 
(journal_t *, unsigned long, unsigned long, unsigned long); +extern int jbd2_journal_set_features + (journal_t *, unsigned long, unsigned long, unsigned long); +extern void jbd2_journal_clear_features + (journal_t *, unsigned long, unsigned long, unsigned long); +extern int jbd2_journal_load (journal_t *journal); +extern int jbd2_journal_destroy (journal_t *); +extern int jbd2_journal_recover (journal_t *journal); +extern int jbd2_journal_wipe (journal_t *, int); +extern int jbd2_journal_skip_recovery (journal_t *); +extern void jbd2_journal_update_sb_errno(journal_t *); +extern int jbd2_journal_update_sb_log_tail (journal_t *, tid_t, + unsigned long, blk_opf_t); +extern void jbd2_journal_abort (journal_t *, int); +extern int jbd2_journal_errno (journal_t *); +extern void jbd2_journal_ack_err (journal_t *); +extern int jbd2_journal_clear_err (journal_t *); +extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); +extern int jbd2_journal_force_commit(journal_t *); +extern int jbd2_journal_force_commit_nested(journal_t *); +extern int jbd2_journal_inode_ranged_write(handle_t *handle, + struct jbd2_inode *inode, loff_t start_byte, + loff_t length); +extern int jbd2_journal_inode_ranged_wait(handle_t *handle, + struct jbd2_inode *inode, loff_t start_byte, + loff_t length); +extern int jbd2_journal_finish_inode_data_buffers( + struct jbd2_inode *jinode); +extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, + struct jbd2_inode *inode, loff_t new_size); +extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); +extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode); + +/* + * journal_head management + */ +struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh); +struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh); +void jbd2_journal_put_journal_head(struct journal_head *jh); + +/* + * handle management + */ +extern struct kmem_cache *jbd2_handle_cache; + +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). The typecast is + * intentional to enforce typesafety. + */ +#define jbd2_alloc_handle(_gfp_flags) \ + ((handle_t *)kmem_cache_zalloc(jbd2_handle_cache, _gfp_flags)) + +static inline void jbd2_free_handle(handle_t *handle) +{ + kmem_cache_free(jbd2_handle_cache, handle); +} + +/* + * jbd2_inode management (optional, for those file systems that want to use + * dynamically allocated jbd2_inode structures) + */ +extern struct kmem_cache *jbd2_inode_cache; + +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). The typecast is + * intentional to enforce typesafety. 
+ */ +#define jbd2_alloc_inode(_gfp_flags) \ + ((struct jbd2_inode *)kmem_cache_alloc(jbd2_inode_cache, _gfp_flags)) + +static inline void jbd2_free_inode(struct jbd2_inode *jinode) +{ + kmem_cache_free(jbd2_inode_cache, jinode); +} + +/* Primary revoke support */ +#define JOURNAL_REVOKE_DEFAULT_HASH 256 +extern int jbd2_journal_init_revoke(journal_t *, int); +extern void jbd2_journal_destroy_revoke_record_cache(void); +extern void jbd2_journal_destroy_revoke_table_cache(void); +extern int __init jbd2_journal_init_revoke_record_cache(void); +extern int __init jbd2_journal_init_revoke_table_cache(void); +struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size); +void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table); + +extern void jbd2_journal_destroy_revoke(journal_t *); +extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *); +extern void jbd2_journal_cancel_revoke(handle_t *, struct journal_head *); +extern void jbd2_journal_write_revoke_records(transaction_t *transaction, + struct list_head *log_bufs); + +/* Recovery revoke support */ +extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t); +extern int jbd2_journal_test_revoke(journal_t *, unsigned long long, tid_t); +extern void jbd2_journal_clear_revoke(journal_t *); +extern void jbd2_journal_switch_revoke_table(journal_t *journal); +extern void jbd2_clear_buffer_revoked_flags(journal_t *journal); + +/* + * The log thread user interface: + * + * Request space in the current transaction, and force transaction commit + * transitions on demand. + */ + +int jbd2_log_start_commit(journal_t *journal, tid_t tid); +int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); +int jbd2_log_wait_commit(journal_t *journal, tid_t tid); +int jbd2_transaction_committed(journal_t *journal, tid_t tid); +int jbd2_complete_transaction(journal_t *journal, tid_t tid); +int jbd2_log_do_checkpoint(journal_t *journal); +int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); + +void __jbd2_log_wait_for_space(journal_t *journal); +extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); +extern int jbd2_cleanup_journal_tail(journal_t *); + +/* Fast commit related APIs */ +int jbd2_fc_begin_commit(journal_t *journal, tid_t tid); +int jbd2_fc_end_commit(journal_t *journal); +int jbd2_fc_end_commit_fallback(journal_t *journal); +int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out); +int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode); +int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); +int jbd2_fc_wait_bufs(journal_t *journal, int num_blks); +void jbd2_fc_release_bufs(journal_t *journal); + +/* + * is_journal_aborted + * + * Simple test wrapper function to test the JBD2_ABORT state flag. This + * bit, when set, indicates that we have had a fatal error somewhere, + * either inside the journaling layer or indicated to us by the client + * (eg. ext3), and that we should not commit any further + * transactions.
+ */ + +static inline int is_journal_aborted(journal_t *journal) +{ + return journal->j_flags & JBD2_ABORT; +} + +static inline int is_handle_aborted(handle_t *handle) +{ + if (handle->h_aborted || !handle->h_transaction) + return 1; + return is_journal_aborted(handle->h_transaction->t_journal); +} + +static inline void jbd2_journal_abort_handle(handle_t *handle) +{ + handle->h_aborted = 1; +} + +static inline void jbd2_init_fs_dev_write_error(journal_t *journal) +{ + struct address_space *mapping = journal->j_fs_dev->bd_mapping; + + /* + * Save the original wb_err value of client fs's bdev mapping which + * could be used to detect the client fs's metadata async write error. + */ + errseq_check_and_advance(&mapping->wb_err, &journal->j_fs_dev_wb_err); +} + +static inline int jbd2_check_fs_dev_write_error(journal_t *journal) +{ + struct address_space *mapping = journal->j_fs_dev->bd_mapping; + + return errseq_check(&mapping->wb_err, + READ_ONCE(journal->j_fs_dev_wb_err)); +} + +#endif /* __KERNEL__ */ + +/* Comparison functions for transaction IDs: perform comparisons using + * modulo arithmetic so that they work over sequence number wraps. */ + +static inline int tid_gt(tid_t x, tid_t y) +{ + int difference = (x - y); + return (difference > 0); +} + +static inline int tid_geq(tid_t x, tid_t y) +{ + int difference = (x - y); + return (difference >= 0); +} + +extern int jbd2_journal_blocks_per_folio(struct inode *inode); +extern size_t journal_tag_bytes(journal_t *journal); + +static inline int jbd2_journal_has_csum_v2or3(journal_t *journal) +{ + return jbd2_has_feature_csum2(journal) || + jbd2_has_feature_csum3(journal); +} + +static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb) +{ + int num_fc_blocks = be32_to_cpu(jsb->s_num_fc_blks); + + return num_fc_blocks ? num_fc_blocks : JBD2_DEFAULT_FAST_COMMIT_BLOCKS; +} + +/* + * Return number of free blocks in the log. Must be called under j_state_lock. 
+ */ +static inline unsigned long jbd2_log_space_left(journal_t *journal) +{ + /* Allow for rounding errors */ + long free = journal->j_free - 32; + + if (journal->j_committing_transaction) { + free -= atomic_read(&journal-> + j_committing_transaction->t_outstanding_credits); + } + return max_t(long, free, 0); +} + +/* + * Definitions which augment the buffer_head layer + */ + +/* journaling buffer types */ +#define BJ_None 0 /* Not journaled */ +#define BJ_Metadata 1 /* Normal journaled metadata */ +#define BJ_Forget 2 /* Buffer superseded by this transaction */ +#define BJ_Shadow 3 /* Buffer contents being shadowed to the log */ +#define BJ_Reserved 4 /* Buffer is reserved for access by journal */ +#define BJ_Types 5 + +static inline u32 jbd2_chksum(u32 crc, const void *address, unsigned int length) +{ + return crc32c(crc, address, length); +} + +/* Return most recent uncommitted transaction */ +static inline tid_t jbd2_get_latest_transaction(journal_t *journal) +{ + tid_t tid; + + read_lock(&journal->j_state_lock); + tid = journal->j_commit_request; + if (journal->j_running_transaction) + tid = journal->j_running_transaction->t_tid; + read_unlock(&journal->j_state_lock); + return tid; +} + +static inline int jbd2_handle_buffer_credits(handle_t *handle) +{ + journal_t *journal; + + if (!handle->h_reserved) + journal = handle->h_transaction->t_journal; + else + journal = handle->h_journal; + + return handle->h_total_credits - + DIV_ROUND_UP(handle->h_revoke_credits_requested, + journal->j_revoke_records_per_block); +} + +#ifdef __KERNEL__ + +#define buffer_trace_init(bh) do {} while (0) +#define print_buffer_fields(bh) do {} while (0) +#define print_buffer_trace(bh) do {} while (0) +#define BUFFER_TRACE(bh, info) do {} while (0) +#define BUFFER_TRACE2(bh, bh2, info) do {} while (0) +#define JBUFFER_TRACE(jh, info) do {} while (0) + +#endif /* __KERNEL__ */ + +#define EFSBADCRC EBADMSG /* Bad CRC detected */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ + +#endif /* _LINUX_JBD2_H */ -- 2.43.0
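To tie the declarations in this header together, here is a minimal usage sketch (not part of the patch) of the handle lifecycle as a client filesystem would drive it; the journal_t and buffer_head are assumed to be obtained elsewhere, and error handling is abbreviated:

static int example_modify_metadata(journal_t *journal, struct buffer_head *bh)
{
	handle_t *handle;
	int err;

	/* Reserve credit for one metadata buffer in the running transaction */
	handle = jbd2_journal_start(journal, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* Announce the modification so jbd2 can manage frozen copies */
	err = jbd2_journal_get_write_access(handle, bh);
	if (!err) {
		/* ... update bh->b_data here ... */
		err = jbd2_journal_dirty_metadata(handle, bh);
	}

	/* Release the handle; the commit happens asynchronously later */
	jbd2_journal_stop(handle);
	return err;
}

The credit count passed to jbd2_journal_start() bounds how many buffers the handle may dirty, which is what the h_total_credits accounting above enforces.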