From: Simon Glass <simon.glass@canonical.com> Copy acl.c, acl.h, balloc.c, bitmap.c, block_validity.c, crypto.c, and dir.c from Linux v6.18 fs/ext4 directory. - acl: POSIX ACL support - balloc: block allocation helpers - bitmap: bitmap manipulation utilities - block_validity: block validity checking - crypto: filesystem encryption support - dir: directory entry handling Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> --- fs/ext4l/acl.c | 304 +++++++++++ fs/ext4l/acl.h | 74 +++ fs/ext4l/balloc.c | 1003 +++++++++++++++++++++++++++++++++++++ fs/ext4l/bitmap.c | 99 ++++ fs/ext4l/block_validity.c | 370 ++++++++++++++ fs/ext4l/crypto.c | 241 +++++++++ fs/ext4l/dir.c | 693 +++++++++++++++++++++++++ 7 files changed, 2784 insertions(+) create mode 100644 fs/ext4l/acl.c create mode 100644 fs/ext4l/acl.h create mode 100644 fs/ext4l/balloc.c create mode 100644 fs/ext4l/bitmap.c create mode 100644 fs/ext4l/block_validity.c create mode 100644 fs/ext4l/crypto.c create mode 100644 fs/ext4l/dir.c diff --git a/fs/ext4l/acl.c b/fs/ext4l/acl.c new file mode 100644 index 00000000000..3bffe862f95 --- /dev/null +++ b/fs/ext4l/acl.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/acl.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> + */ + +#include <linux/quotaops.h> +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" +#include "acl.h" + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl * +ext4_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(ext4_acl_header)) + return ERR_PTR(-EINVAL); + if (((ext4_acl_header *)value)->a_version != + cpu_to_le32(EXT4_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(ext4_acl_header); + count = ext4_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n = 0; n < count; n++) { + ext4_acl_entry *entry = + (ext4_acl_entry *)value; + if ((char *)value + sizeof(ext4_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + + switch (acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(ext4_acl_entry_short); + break; + + case ACL_USER: + value = (char *)value + sizeof(ext4_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + case ACL_GROUP: + value = (char *)value + sizeof(ext4_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. + */ +static void * +ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + ext4_acl_header *ext_acl; + char *e; + size_t n; + + *size = ext4_acl_size(acl->a_count); + ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count * + sizeof(ext4_acl_entry), GFP_NOFS); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); + e = (char *)ext_acl + sizeof(ext4_acl_header); + for (n = 0; n < acl->a_count; n++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; + ext4_acl_entry *entry = (ext4_acl_entry *)e; + entry->e_tag = cpu_to_le16(acl_e->e_tag); + entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch (acl_e->e_tag) { + case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, acl_e->e_uid)); + e += sizeof(ext4_acl_entry); + break; + case ACL_GROUP: + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, acl_e->e_gid)); + e += sizeof(ext4_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(ext4_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +/* + * Inode operation get_posix_acl(). + * + * inode->i_rwsem: don't care + */ +struct posix_acl * +ext4_get_acl(struct inode *inode, int type, bool rcu) +{ + int name_index; + char *value = NULL; + struct posix_acl *acl; + int retval; + + if (rcu) + return ERR_PTR(-ECHILD); + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + BUG(); + } + retval = ext4_xattr_get(inode, name_index, "", NULL, 0); + if (retval > 0) { + value = kmalloc(retval, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + retval = ext4_xattr_get(inode, name_index, "", value, retval); + } + if (retval > 0) + acl = ext4_acl_from_disk(value, retval); + else if (retval == -ENODATA || retval == -ENOSYS) + acl = NULL; + else + acl = ERR_PTR(retval); + kfree(value); + + return acl; +} + +/* + * Set the access or default ACL of an inode. + * + * inode->i_rwsem: down unless called from ext4_new_inode + */ +static int +__ext4_set_acl(handle_t *handle, struct inode *inode, int type, + struct posix_acl *acl, int xattr_flags) +{ + int name_index; + void *value = NULL; + size_t size = 0; + int error; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + + case ACL_TYPE_DEFAULT: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + + default: + return -EINVAL; + } + if (acl) { + value = ext4_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + error = ext4_xattr_set_handle(handle, inode, name_index, "", + value, size, xattr_flags); + + kfree(value); + if (!error) + set_cached_acl(inode, type, acl); + + return error; +} + +int +ext4_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, + struct posix_acl *acl, int type) +{ + handle_t *handle; + int error, credits, retries = 0; + size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0; + struct inode *inode = d_inode(dentry); + umode_t mode = inode->i_mode; + int update_mode = 0; + + error = dquot_initialize(inode); + if (error) + return error; +retry: + error = ext4_xattr_set_credits(inode, acl_size, false /* is_create */, + &credits); + if (error) + return error; + + handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if ((type == ACL_TYPE_ACCESS) && acl) { + error = posix_acl_update_mode(idmap, inode, &mode, &acl); + if (error) + goto out_stop; + if (mode != inode->i_mode) + update_mode = 1; + } + + error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */); + if (!error && update_mode) { + inode->i_mode = mode; + inode_set_ctime_current(inode); + error = ext4_mark_inode_dirty(handle, inode); + } +out_stop: + ext4_journal_stop(handle); + if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + return error; +} + +/* + * Initialize the ACLs of a new inode. Called from ext4_new_inode. + * + * dir->i_rwsem: down + * inode->i_rwsem: up (access to inode is still exclusive) + */ +int +ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + struct posix_acl *default_acl, *acl; + int error; + + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; + + if (default_acl) { + error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, + default_acl, XATTR_CREATE); + posix_acl_release(default_acl); + } else { + inode->i_default_acl = NULL; + } + if (acl) { + if (!error) + error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, + acl, XATTR_CREATE); + posix_acl_release(acl); + } else { + inode->i_acl = NULL; + } + return error; +} diff --git a/fs/ext4l/acl.h b/fs/ext4l/acl.h new file mode 100644 index 00000000000..0c5a79c3b5d --- /dev/null +++ b/fs/ext4l/acl.h @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + File: fs/ext4/acl.h + + (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> +*/ + +#include <linux/posix_acl_xattr.h> + +#define EXT4_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} ext4_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} ext4_acl_entry_short; + +typedef struct { + __le32 a_version; +} ext4_acl_header; + +static inline size_t ext4_acl_size(int count) +{ + if (count <= 4) { + return sizeof(ext4_acl_header) + + count * sizeof(ext4_acl_entry_short); + } else { + return sizeof(ext4_acl_header) + + 4 * sizeof(ext4_acl_entry_short) + + (count - 4) * sizeof(ext4_acl_entry); + } +} + +static inline int ext4_acl_count(size_t size) +{ + ssize_t s; + size -= sizeof(ext4_acl_header); + s = size - 4 * sizeof(ext4_acl_entry_short); + if (s < 0) { + if (size % sizeof(ext4_acl_entry_short)) + return -1; + return size / sizeof(ext4_acl_entry_short); + } else { + if (s % sizeof(ext4_acl_entry)) + return -1; + return s / sizeof(ext4_acl_entry) + 4; + } +} + +#ifdef CONFIG_EXT4_FS_POSIX_ACL + +/* acl.c */ +struct posix_acl *ext4_get_acl(struct inode *inode, int type, bool rcu); +int ext4_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, + struct posix_acl *acl, int type); +extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); + +#else /* CONFIG_EXT4_FS_POSIX_ACL */ +#include <linux/sched.h> +#define ext4_get_acl NULL +#define ext4_set_acl NULL + +static inline int +ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + return 0; +} +#endif /* CONFIG_EXT4_FS_POSIX_ACL */ + diff --git a/fs/ext4l/balloc.c b/fs/ext4l/balloc.c new file mode 100644 index 00000000000..c9329ed5c09 --- /dev/null +++ b/fs/ext4l/balloc.c @@ -0,0 +1,1003 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/balloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include <linux/time.h> +#include <linux/capability.h> +#include <linux/fs.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> +#include "ext4.h" +#include "ext4_jbd2.h" +#include "mballoc.h" + +#include <trace/events/ext4.h> +#include <kunit/static_stub.h> + +static unsigned ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group); +/* + * balloc.c contains the blocks allocation and deallocation routines + */ + +/* + * Calculate block group number for a given block number + */ +ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block) +{ + ext4_group_t group; + + if (test_opt2(sb, STD_GROUP_SIZE)) + group = (block - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >> + (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3); + else + ext4_get_group_no_and_offset(sb, block, &group, NULL); + return group; +} + +/* + * Calculate the block group number and offset into the block/cluster + * allocation bitmap, given a block number + */ +void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + ext4_grpblk_t offset; + + blocknr = blocknr - le32_to_cpu(es->s_first_data_block); + offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >> + EXT4_SB(sb)->s_cluster_bits; + if (offsetp) + *offsetp = offset; + if (blockgrpp) + *blockgrpp = blocknr; + +} + +/* + * Check whether the 'block' lives within the 'block_group'. Returns 1 if so + * and 0 otherwise. + */ +static inline int ext4_block_in_group(struct super_block *sb, + ext4_fsblk_t block, + ext4_group_t block_group) +{ + ext4_group_t actual_group; + + actual_group = ext4_get_group_number(sb, block); + return (actual_group == block_group) ? 1 : 0; +} + +/* + * Return the number of clusters used for file system metadata; this + * represents the overhead needed by the file system. + */ +static unsigned ext4_num_overhead_clusters(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + unsigned base_clusters, num_clusters; + int block_cluster = -1, inode_cluster; + int itbl_cluster_start = -1, itbl_cluster_end = -1; + ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group); + ext4_fsblk_t end = start + EXT4_BLOCKS_PER_GROUP(sb) - 1; + ext4_fsblk_t itbl_blk_start, itbl_blk_end; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* This is the number of clusters used by the superblock, + * block group descriptors, and reserved block group + * descriptor blocks */ + base_clusters = ext4_num_base_meta_clusters(sb, block_group); + num_clusters = base_clusters; + + /* + * Account and record inode table clusters if any cluster + * is in the block group, or inode table cluster range is + * [-1, -1] and won't overlap with block/inode bitmap cluster + * accounted below. + */ + itbl_blk_start = ext4_inode_table(sb, gdp); + itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1; + if (itbl_blk_start <= end && itbl_blk_end >= start) { + itbl_blk_start = max(itbl_blk_start, start); + itbl_blk_end = min(itbl_blk_end, end); + + itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start); + itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start); + + num_clusters += itbl_cluster_end - itbl_cluster_start + 1; + /* check if border cluster is overlapped */ + if (itbl_cluster_start == base_clusters - 1) + num_clusters--; + } + + /* + * For the allocation bitmaps, we first need to check to see + * if the block is in the block group. If it is, then check + * to see if the cluster is already accounted for in the clusters + * used for the base metadata cluster and inode tables cluster. + * Normally all of these blocks are contiguous, so the special + * case handling shouldn't be necessary except for *very* + * unusual file system layouts. + */ + if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { + block_cluster = EXT4_B2C(sbi, + ext4_block_bitmap(sb, gdp) - start); + if (block_cluster >= base_clusters && + (block_cluster < itbl_cluster_start || + block_cluster > itbl_cluster_end)) + num_clusters++; + } + + if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { + inode_cluster = EXT4_B2C(sbi, + ext4_inode_bitmap(sb, gdp) - start); + /* + * Additional check if inode bitmap is in just accounted + * block_cluster + */ + if (inode_cluster != block_cluster && + inode_cluster >= base_clusters && + (inode_cluster < itbl_cluster_start || + inode_cluster > itbl_cluster_end)) + num_clusters++; + } + + return num_clusters; +} + +static unsigned int num_clusters_in_group(struct super_block *sb, + ext4_group_t block_group) +{ + unsigned int blocks; + + if (block_group == ext4_get_groups_count(sb) - 1) { + /* + * Even though mke2fs always initializes the first and + * last group, just in case some other tool was used, + * we need to make sure we calculate the right free + * blocks. + */ + blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) - + ext4_group_first_block_no(sb, block_group); + } else + blocks = EXT4_BLOCKS_PER_GROUP(sb); + return EXT4_NUM_B2C(EXT4_SB(sb), blocks); +} + +/* Initializes an uninitialized block bitmap */ +static int ext4_init_block_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + unsigned int bit, bit_max; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t start, tmp; + + ASSERT(buffer_locked(bh)); + + if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT | + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + return -EFSBADCRC; + } + memset(bh->b_data, 0, sb->s_blocksize); + + bit_max = ext4_num_base_meta_clusters(sb, block_group); + if ((bit_max >> 3) >= bh->b_size) + return -EFSCORRUPTED; + + for (bit = 0; bit < bit_max; bit++) + ext4_set_bit(bit, bh->b_data); + + start = ext4_group_first_block_no(sb, block_group); + + /* Set bits for block and inode bitmaps, and inode table */ + tmp = ext4_block_bitmap(sb, gdp); + if (ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); + + tmp = ext4_inode_bitmap(sb, gdp); + if (ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); + + tmp = ext4_inode_table(sb, gdp); + for (; tmp < ext4_inode_table(sb, gdp) + + sbi->s_itb_per_group; tmp++) { + if (ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); + } + + /* + * Also if the number of blocks within the group is less than + * the blocksize * 8 ( which is the size of bitmap ), set rest + * of the block bitmap to 1 + */ + ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), + sb->s_blocksize * 8, bh->b_data); + return 0; +} + +/* Return the number of free blocks in a block group. It is used when + * the block bitmap is uninitialized, so we can't just count the bits + * in the bitmap. */ +unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + return num_clusters_in_group(sb, block_group) - + ext4_num_overhead_clusters(sb, block_group, gdp); +} + +/* + * The free blocks are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. The descriptors are loaded in memory + * when a file system is mounted (see ext4_fill_super). + */ + +/** + * ext4_get_group_desc() -- load group descriptor from disk + * @sb: super block + * @block_group: given block group + * @bh: pointer to the buffer head to store the block + * group descriptor + */ +struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head **bh) +{ + unsigned int group_desc; + unsigned int offset; + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *bh_p; + + KUNIT_STATIC_STUB_REDIRECT(ext4_get_group_desc, + sb, block_group, bh); + + if (block_group >= ngroups) { + ext4_error(sb, "block_group >= groups_count - block_group = %u," + " groups_count = %u", block_group, ngroups); + + return NULL; + } + + group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); + offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); + bh_p = sbi_array_rcu_deref(sbi, s_group_desc, group_desc); + /* + * sbi_array_rcu_deref returns with rcu unlocked, this is ok since + * the pointer being dereferenced won't be dereferenced again. By + * looking at the usage in add_new_gdb() the value isn't modified, + * just the pointer, and so it remains valid. + */ + if (!bh_p) { + ext4_error(sb, "Group descriptor not loaded - " + "block_group = %u, group_desc = %u, desc = %u", + block_group, group_desc, offset); + return NULL; + } + + desc = (struct ext4_group_desc *)( + (__u8 *)bh_p->b_data + + offset * EXT4_DESC_SIZE(sb)); + if (bh) + *bh = bh_p; + return desc; +} + +static ext4_fsblk_t ext4_valid_block_bitmap_padding(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh) +{ + ext4_grpblk_t next_zero_bit; + unsigned long bitmap_size = sb->s_blocksize * 8; + unsigned int offset = num_clusters_in_group(sb, block_group); + + if (bitmap_size <= offset) + return 0; + + next_zero_bit = ext4_find_next_zero_bit(bh->b_data, bitmap_size, offset); + + return (next_zero_bit < bitmap_size ? next_zero_bit : 0); +} + +struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info **grp_info; + long indexv, indexh; + + if (unlikely(group >= EXT4_SB(sb)->s_groups_count)) + return NULL; + indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); + indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); + grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv); + return grp_info[indexh]; +} + +/* + * Return the block number which was discovered to be invalid, or 0 if + * the block bitmap is valid. + */ +static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + ext4_group_t block_group, + struct buffer_head *bh) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_grpblk_t offset; + ext4_grpblk_t next_zero_bit; + ext4_grpblk_t max_bit = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_fsblk_t blk; + ext4_fsblk_t group_first_block; + + if (ext4_has_feature_flex_bg(sb)) { + /* with FLEX_BG, the inode/block bitmaps and itable + * blocks may not be in the group at all + * so the bitmap validation will be skipped for those groups + * or it has to also read the block group where the bitmaps + * are located to verify they are set. + */ + return 0; + } + group_first_block = ext4_group_first_block_no(sb, block_group); + + /* check whether block bitmap block number is set */ + blk = ext4_block_bitmap(sb, desc); + offset = blk - group_first_block; + if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || + !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) + /* bad block bitmap */ + return blk; + + /* check whether the inode bitmap block number is set */ + blk = ext4_inode_bitmap(sb, desc); + offset = blk - group_first_block; + if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || + !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) + /* bad block bitmap */ + return blk; + + /* check whether the inode table block number is set */ + blk = ext4_inode_table(sb, desc); + offset = blk - group_first_block; + if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || + EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) >= max_bit) + return blk; + next_zero_bit = ext4_find_next_zero_bit(bh->b_data, + EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1, + EXT4_B2C(sbi, offset)); + if (next_zero_bit < + EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1) + /* bad bitmap for inode tables */ + return blk; + return 0; +} + +static int ext4_validate_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + ext4_group_t block_group, + struct buffer_head *bh) +{ + ext4_fsblk_t blk; + struct ext4_group_info *grp; + + if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) + return 0; + + grp = ext4_get_group_info(sb, block_group); + + if (buffer_verified(bh)) + return 0; + if (!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + return -EFSCORRUPTED; + + ext4_lock_group(sb, block_group); + if (buffer_verified(bh)) + goto verified; + if (unlikely(!ext4_block_bitmap_csum_verify(sb, desc, bh) || + ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EFSBADCRC; + } + blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); + if (unlikely(blk != 0)) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "bg %u: block %llu: invalid block bitmap", + block_group, blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EFSCORRUPTED; + } + blk = ext4_valid_block_bitmap_padding(sb, block_group, bh); + if (unlikely(blk != 0)) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "bg %u: block %llu: padding at end of block bitmap is not set", + block_group, blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EFSCORRUPTED; + } + set_buffer_verified(bh); +verified: + ext4_unlock_group(sb, block_group); + return 0; +} + +/** + * ext4_read_block_bitmap_nowait() + * @sb: super block + * @block_group: given block group + * @ignore_locked: ignore locked buffers + * + * Read the bitmap for a given block_group,and validate the + * bits for block/inode/inode tables are set in the bitmaps + * + * Return buffer_head on success or an ERR_PTR in case of failure. + */ +struct buffer_head * +ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, + bool ignore_locked) +{ + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *bh; + ext4_fsblk_t bitmap_blk; + int err; + + KUNIT_STATIC_STUB_REDIRECT(ext4_read_block_bitmap_nowait, + sb, block_group, ignore_locked); + + desc = ext4_get_group_desc(sb, block_group, NULL); + if (!desc) + return ERR_PTR(-EFSCORRUPTED); + bitmap_blk = ext4_block_bitmap(sb, desc); + if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { + ext4_error(sb, "Invalid block bitmap block %llu in " + "block_group %u", bitmap_blk, block_group); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return ERR_PTR(-EFSCORRUPTED); + } + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext4_warning(sb, "Cannot get buffer for block bitmap - " + "block_group = %u, block_bitmap = %llu", + block_group, bitmap_blk); + return ERR_PTR(-ENOMEM); + } + + if (ignore_locked && buffer_locked(bh)) { + /* buffer under IO already, return if called for prefetching */ + put_bh(bh); + return NULL; + } + + if (bitmap_uptodate(bh)) + goto verify; + + lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + goto verify; + } + ext4_lock_group(sb, block_group); + if (ext4_has_group_desc_csum(sb) && + (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { + if (block_group == 0) { + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + ext4_error(sb, "Block bitmap for bg 0 marked " + "uninitialized"); + err = -EFSCORRUPTED; + goto out; + } + err = ext4_init_block_bitmap(sb, bh, block_group, desc); + if (err) { + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + ext4_error(sb, "Failed to init block bitmap for group " + "%u: %d", block_group, err); + goto out; + } + set_bitmap_uptodate(bh); + set_buffer_uptodate(bh); + set_buffer_verified(bh); + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + return bh; + } + ext4_unlock_group(sb, block_group); + if (buffer_uptodate(bh)) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh); + unlock_buffer(bh); + goto verify; + } + /* + * submit the buffer_head for reading + */ + set_buffer_new(bh); + trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked); + ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO | + (ignore_locked ? REQ_RAHEAD : 0), + ext4_end_bitmap_read, + ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_EIO)); + return bh; +verify: + err = ext4_validate_block_bitmap(sb, desc, block_group, bh); + if (err) + goto out; + return bh; +out: + put_bh(bh); + return ERR_PTR(err); +} + +/* Returns 0 on success, -errno on error */ +int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, + struct buffer_head *bh) +{ + struct ext4_group_desc *desc; + + KUNIT_STATIC_STUB_REDIRECT(ext4_wait_block_bitmap, + sb, block_group, bh); + + if (!buffer_new(bh)) + return 0; + desc = ext4_get_group_desc(sb, block_group, NULL); + if (!desc) + return -EFSCORRUPTED; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + ext4_error_err(sb, EIO, "Cannot read block bitmap - " + "block_group = %u, block_bitmap = %llu", + block_group, (unsigned long long) bh->b_blocknr); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EIO; + } + clear_buffer_new(bh); + /* Panic or remount fs read-only if block bitmap is invalid */ + return ext4_validate_block_bitmap(sb, desc, block_group, bh); +} + +struct buffer_head * +ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) +{ + struct buffer_head *bh; + int err; + + bh = ext4_read_block_bitmap_nowait(sb, block_group, false); + if (IS_ERR(bh)) + return bh; + err = ext4_wait_block_bitmap(sb, block_group, bh); + if (err) { + put_bh(bh); + return ERR_PTR(err); + } + return bh; +} + +/** + * ext4_has_free_clusters() + * @sbi: in-core super block structure. + * @nclusters: number of needed blocks + * @flags: flags from ext4_mb_new_blocks() + * + * Check if filesystem has nclusters free & available for allocation. + * On success return 1, return 0 on failure. + */ +static int ext4_has_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags) +{ + s64 free_clusters, dirty_clusters, rsv, resv_clusters; + struct percpu_counter *fcc = &sbi->s_freeclusters_counter; + struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter; + + free_clusters = percpu_counter_read_positive(fcc); + dirty_clusters = percpu_counter_read_positive(dcc); + resv_clusters = atomic64_read(&sbi->s_resv_clusters); + + /* + * r_blocks_count should always be multiple of the cluster ratio so + * we are safe to do a plane bit shift only. + */ + rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) + + resv_clusters; + + if (free_clusters - (nclusters + rsv + dirty_clusters) < + EXT4_FREECLUSTERS_WATERMARK) { + free_clusters = percpu_counter_sum_positive(fcc); + dirty_clusters = percpu_counter_sum_positive(dcc); + } + /* Check whether we have space after accounting for current + * dirty clusters & root reserved clusters. + */ + if (free_clusters >= (rsv + nclusters + dirty_clusters)) + return 1; + + /* Hm, nope. Are (enough) root reserved clusters available? */ + if (uid_eq(sbi->s_resuid, current_fsuid()) || + (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || + (flags & EXT4_MB_USE_ROOT_BLOCKS) || + capable(CAP_SYS_RESOURCE)) { + + if (free_clusters >= (nclusters + dirty_clusters + + resv_clusters)) + return 1; + } + /* No free blocks. Let's see if we can dip into reserved pool */ + if (flags & EXT4_MB_USE_RESERVED) { + if (free_clusters >= (nclusters + dirty_clusters)) + return 1; + } + + return 0; +} + +int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags) +{ + if (ext4_has_free_clusters(sbi, nclusters, flags)) { + percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters); + return 0; + } else + return -ENOSPC; +} + +/** + * ext4_should_retry_alloc() - check if a block allocation should be retried + * @sb: superblock + * @retries: number of retry attempts made so far + * + * ext4_should_retry_alloc() is called when ENOSPC is returned while + * attempting to allocate blocks. If there's an indication that a pending + * journal transaction might free some space and allow another attempt to + * succeed, this function will wait for the current or committing transaction + * to complete and then return TRUE. + */ +int ext4_should_retry_alloc(struct super_block *sb, int *retries) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!sbi->s_journal) + return 0; + + if (++(*retries) > 3) { + percpu_counter_inc(&sbi->s_sra_exceeded_retry_limit); + return 0; + } + + /* + * if there's no indication that blocks are about to be freed it's + * possible we just missed a transaction commit that did so + */ + smp_mb(); + if (atomic_read(&sbi->s_mb_free_pending) == 0) { + if (test_opt(sb, DISCARD)) { + atomic_inc(&sbi->s_retry_alloc_pending); + flush_work(&sbi->s_discard_work); + atomic_dec(&sbi->s_retry_alloc_pending); + } + return ext4_has_free_clusters(sbi, 1, 0); + } + + /* + * it's possible we've just missed a transaction commit here, + * so ignore the returned status + */ + ext4_debug("%s: retrying operation after ENOSPC\n", sb->s_id); + (void) jbd2_journal_force_commit_nested(sbi->s_journal); + return 1; +} + +/* + * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks + * + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block(filesystem wide) + * @count: pointer to total number of clusters needed + * @errp: error code + * + * Return 1st allocated block number on success, *count stores total account + * error stores in errp pointer + */ +ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, unsigned int flags, + unsigned long *count, int *errp) +{ + struct ext4_allocation_request ar; + ext4_fsblk_t ret; + + memset(&ar, 0, sizeof(ar)); + /* Fill with neighbour allocated blocks */ + ar.inode = inode; + ar.goal = goal; + ar.len = count ? *count : 1; + ar.flags = flags; + + ret = ext4_mb_new_blocks(handle, &ar, errp); + if (count) + *count = ar.len; + /* + * Account for the allocated meta blocks. We will never + * fail EDQUOT for metdata, but we do account for it. + */ + if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { + dquot_alloc_block_nofail(inode, + EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); + } + return ret; +} + +/** + * ext4_count_free_clusters() -- count filesystem free clusters + * @sb: superblock + * + * Adds up the number of free clusters from each block group. + */ +ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) +{ + ext4_fsblk_t desc_count; + struct ext4_group_desc *gdp; + ext4_group_t i; + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_group_info *grp; +#ifdef EXT4FS_DEBUG + struct ext4_super_block *es; + ext4_fsblk_t bitmap_count; + unsigned int x; + struct buffer_head *bitmap_bh = NULL; + + es = EXT4_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + grp = NULL; + if (EXT4_SB(sb)->s_group_info) + grp = ext4_get_group_info(sb, i); + if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + desc_count += ext4_free_group_clusters(sb, gdp); + brelse(bitmap_bh); + bitmap_bh = ext4_read_block_bitmap(sb, i); + if (IS_ERR(bitmap_bh)) { + bitmap_bh = NULL; + continue; + } + + x = ext4_count_free(bitmap_bh->b_data, + EXT4_CLUSTERS_PER_GROUP(sb) / 8); + printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", + i, ext4_free_group_clusters(sb, gdp), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" + ", computed = %llu, %llu\n", + EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), + desc_count, bitmap_count); + return bitmap_count; +#else + desc_count = 0; + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + grp = NULL; + if (EXT4_SB(sb)->s_group_info) + grp = ext4_get_group_info(sb, i); + if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + desc_count += ext4_free_group_clusters(sb, gdp); + } + + return desc_count; +#endif +} + +static inline int test_root(ext4_group_t a, int b) +{ + while (1) { + if (a < b) + return 0; + if (a == b) + return 1; + if ((a % b) != 0) + return 0; + a = a / b; + } +} + +/** + * ext4_bg_has_super - number of blocks used by the superblock in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the superblock (primary or backup) + * in this group. Currently this will be only 0 or 1. + */ +int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + if (group == 0) + return 1; + if (ext4_has_feature_sparse_super2(sb)) { + if (group == le32_to_cpu(es->s_backup_bgs[0]) || + group == le32_to_cpu(es->s_backup_bgs[1])) + return 1; + return 0; + } + if ((group <= 1) || !ext4_has_feature_sparse_super(sb)) + return 1; + if (!(group & 1)) + return 0; + if (test_root(group, 3) || (test_root(group, 5)) || + test_root(group, 7)) + return 1; + + return 0; +} + +static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, + ext4_group_t group) +{ + unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); + ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb); + ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1; + + if (group == first || group == first + 1 || group == last) + return 1; + return 0; +} + +static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, + ext4_group_t group) +{ + if (!ext4_bg_has_super(sb, group)) + return 0; + + if (ext4_has_feature_meta_bg(sb)) + return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); + else + return EXT4_SB(sb)->s_gdb_count; +} + +/** + * ext4_bg_num_gdb - number of blocks used by the group table in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the group descriptor table + * (primary or backup) in this group. In the future there may be a + * different number of descriptor blocks in each group. + */ +unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) +{ + unsigned long first_meta_bg = + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); + unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); + + if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) + return ext4_bg_num_gdb_nometa(sb, group); + + return ext4_bg_num_gdb_meta(sb,group); + +} + +/* + * This function returns the number of file system metadata blocks at + * the beginning of a block group, including the reserved gdt blocks. + */ +unsigned int ext4_num_base_meta_blocks(struct super_block *sb, + ext4_group_t block_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned num; + + /* Check for superblock and gdt backups in this group */ + num = ext4_bg_has_super(sb, block_group); + + if (!ext4_has_feature_meta_bg(sb) || + block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * + sbi->s_desc_per_block) { + if (num) { + num += ext4_bg_num_gdb_nometa(sb, block_group); + num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + } + } else { /* For META_BG_BLOCK_GROUPS */ + num += ext4_bg_num_gdb_meta(sb, block_group); + } + return num; +} + +static unsigned int ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group) +{ + return EXT4_NUM_B2C(EXT4_SB(sb), ext4_num_base_meta_blocks(sb, block_group)); +} + +/** + * ext4_inode_to_goal_block - return a hint for block allocation + * @inode: inode for block allocation + * + * Return the ideal location to start allocating blocks for a + * newly created inode. + */ +ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_group_t block_group; + ext4_grpblk_t colour; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); + ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; + + block_group = ei->i_block_group; + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { + /* + * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME + * block groups per flexgroup, reserve the first block + * group for directories and special files. Regular + * files will start at the second block group. This + * tends to speed up directory access and improves + * fsck times. + */ + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + + /* + * If we are doing delayed allocation, we don't need take + * colour into account. + */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (task_pid_nr(current) % 16) * + (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (task_pid_nr(current) % 16) * + ((last_block - bg_start) / 16); + return bg_start + colour; +} + diff --git a/fs/ext4l/bitmap.c b/fs/ext4l/bitmap.c new file mode 100644 index 00000000000..87760fabdd2 --- /dev/null +++ b/fs/ext4l/bitmap.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/bitmap.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include <linux/buffer_head.h> +#include "ext4.h" + +unsigned int ext4_count_free(char *bitmap, unsigned int numchars) +{ + return numchars * BITS_PER_BYTE - memweight(bitmap, numchars); +} + +int ext4_inode_bitmap_csum_verify(struct super_block *sb, + struct ext4_group_desc *gdp, + struct buffer_head *bh) +{ + __u32 hi; + __u32 provided, calculated; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int sz; + + if (!ext4_has_feature_metadata_csum(sb)) + return 1; + + sz = EXT4_INODES_PER_GROUP(sb) >> 3; + provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); + calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) { + hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi); + provided |= (hi << 16); + } else + calculated &= 0xFFFF; + + return provided == calculated; +} + +void ext4_inode_bitmap_csum_set(struct super_block *sb, + struct ext4_group_desc *gdp, + struct buffer_head *bh) +{ + __u32 csum; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int sz; + + if (!ext4_has_feature_metadata_csum(sb)) + return; + + sz = EXT4_INODES_PER_GROUP(sb) >> 3; + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); + if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) + gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); +} + +int ext4_block_bitmap_csum_verify(struct super_block *sb, + struct ext4_group_desc *gdp, + struct buffer_head *bh) +{ + __u32 hi; + __u32 provided, calculated; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; + + if (!ext4_has_feature_metadata_csum(sb)) + return 1; + + provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); + calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) { + hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi); + provided |= (hi << 16); + } else + calculated &= 0xFFFF; + + return provided == calculated; +} + +void ext4_block_bitmap_csum_set(struct super_block *sb, + struct ext4_group_desc *gdp, + struct buffer_head *bh) +{ + int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; + __u32 csum; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!ext4_has_feature_metadata_csum(sb)) + return; + + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); + if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) + gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16); +} diff --git a/fs/ext4l/block_validity.c b/fs/ext4l/block_validity.c new file mode 100644 index 00000000000..e8c5525afc6 --- /dev/null +++ b/fs/ext4l/block_validity.c @@ -0,0 +1,370 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/block_validity.c + * + * Copyright (C) 2009 + * Theodore Ts'o (tytso@mit.edu) + * + * Track which blocks in the filesystem are metadata blocks that + * should never be used as data blocks by files or directories. + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> +#include <linux/swap.h> +#include <linux/pagemap.h> +#include <linux/blkdev.h> +#include <linux/slab.h> +#include "ext4.h" + +struct ext4_system_zone { + struct rb_node node; + ext4_fsblk_t start_blk; + unsigned int count; + u32 ino; +}; + +static struct kmem_cache *ext4_system_zone_cachep; + +int __init ext4_init_system_zone(void) +{ + ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0); + if (ext4_system_zone_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_system_zone(void) +{ + rcu_barrier(); + kmem_cache_destroy(ext4_system_zone_cachep); +} + +static inline int can_merge(struct ext4_system_zone *entry1, + struct ext4_system_zone *entry2) +{ + if ((entry1->start_blk + entry1->count) == entry2->start_blk && + entry1->ino == entry2->ino) + return 1; + return 0; +} + +static void release_system_zone(struct ext4_system_blocks *system_blks) +{ + struct ext4_system_zone *entry, *n; + + rbtree_postorder_for_each_entry_safe(entry, n, + &system_blks->root, node) + kmem_cache_free(ext4_system_zone_cachep, entry); +} + +/* + * Mark a range of blocks as belonging to the "system zone" --- that + * is, filesystem metadata blocks which should never be used by + * inodes. + */ +static int add_system_zone(struct ext4_system_blocks *system_blks, + ext4_fsblk_t start_blk, + unsigned int count, u32 ino) +{ + struct ext4_system_zone *new_entry, *entry; + struct rb_node **n = &system_blks->root.rb_node, *node; + struct rb_node *parent = NULL, *new_node; + + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_system_zone, node); + if (start_blk < entry->start_blk) + n = &(*n)->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = &(*n)->rb_right; + else /* Unexpected overlap of system zones. */ + return -EFSCORRUPTED; + } + + new_entry = kmem_cache_alloc(ext4_system_zone_cachep, + GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + new_entry->start_blk = start_blk; + new_entry->count = count; + new_entry->ino = ino; + new_node = &new_entry->node; + + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &system_blks->root); + + /* Can we merge to the left? */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + if (can_merge(entry, new_entry)) { + new_entry->start_blk = entry->start_blk; + new_entry->count += entry->count; + rb_erase(node, &system_blks->root); + kmem_cache_free(ext4_system_zone_cachep, entry); + } + } + + /* Can we merge to the right? */ + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + if (can_merge(new_entry, entry)) { + new_entry->count += entry->count; + rb_erase(node, &system_blks->root); + kmem_cache_free(ext4_system_zone_cachep, entry); + } + } + return 0; +} + +static void debug_print_tree(struct ext4_sb_info *sbi) +{ + struct rb_node *node; + struct ext4_system_zone *entry; + struct ext4_system_blocks *system_blks; + int first = 1; + + printk(KERN_INFO "System zones: "); + rcu_read_lock(); + system_blks = rcu_dereference(sbi->s_system_blks); + node = rb_first(&system_blks->root); + while (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ", + entry->start_blk, entry->start_blk + entry->count - 1); + first = 0; + node = rb_next(node); + } + rcu_read_unlock(); + printk(KERN_CONT "\n"); +} + +static int ext4_protect_reserved_inode(struct super_block *sb, + struct ext4_system_blocks *system_blks, + u32 ino) +{ + struct inode *inode; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_map_blocks map; + u32 i = 0, num; + int err = 0, n; + + if ((ino < EXT4_ROOT_INO) || + (ino > le32_to_cpu(sbi->s_es->s_inodes_count))) + return -EINVAL; + inode = ext4_iget(sb, ino, EXT4_IGET_SPECIAL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + num = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + while (i < num) { + cond_resched(); + map.m_lblk = i; + map.m_len = num - i; + n = ext4_map_blocks(NULL, inode, &map, 0); + if (n < 0) { + err = n; + break; + } + if (n == 0) { + i++; + } else { + err = add_system_zone(system_blks, map.m_pblk, n, ino); + if (err < 0) { + if (err == -EFSCORRUPTED) { + EXT4_ERROR_INODE_ERR(inode, -err, + "blocks %llu-%llu from inode overlap system zone", + map.m_pblk, + map.m_pblk + map.m_len - 1); + } + break; + } + i += n; + } + } + iput(inode); + return err; +} + +static void ext4_destroy_system_zone(struct rcu_head *rcu) +{ + struct ext4_system_blocks *system_blks; + + system_blks = container_of(rcu, struct ext4_system_blocks, rcu); + release_system_zone(system_blks); + kfree(system_blks); +} + +/* + * Build system zone rbtree which is used for block validity checking. + * + * The update of system_blks pointer in this function is protected by + * sb->s_umount semaphore. However we have to be careful as we can be + * racing with ext4_inode_block_valid() calls reading system_blks rbtree + * protected only by RCU. That's why we first build the rbtree and then + * swap it in place. + */ +int ext4_setup_system_zone(struct super_block *sb) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_system_blocks *system_blks; + struct ext4_group_desc *gdp; + ext4_group_t i; + int ret; + + system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL); + if (!system_blks) + return -ENOMEM; + + for (i=0; i < ngroups; i++) { + unsigned int meta_blks = ext4_num_base_meta_blocks(sb, i); + + cond_resched(); + if (meta_blks != 0) { + ret = add_system_zone(system_blks, + ext4_group_first_block_no(sb, i), + meta_blks, 0); + if (ret) + goto err; + } + gdp = ext4_get_group_desc(sb, i, NULL); + ret = add_system_zone(system_blks, + ext4_block_bitmap(sb, gdp), 1, 0); + if (ret) + goto err; + ret = add_system_zone(system_blks, + ext4_inode_bitmap(sb, gdp), 1, 0); + if (ret) + goto err; + ret = add_system_zone(system_blks, + ext4_inode_table(sb, gdp), + sbi->s_itb_per_group, 0); + if (ret) + goto err; + } + if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) { + ret = ext4_protect_reserved_inode(sb, system_blks, + le32_to_cpu(sbi->s_es->s_journal_inum)); + if (ret) + goto err; + } + + /* + * System blks rbtree complete, announce it once to prevent racing + * with ext4_inode_block_valid() accessing the rbtree at the same + * time. + */ + rcu_assign_pointer(sbi->s_system_blks, system_blks); + + if (test_opt(sb, DEBUG)) + debug_print_tree(sbi); + return 0; +err: + release_system_zone(system_blks); + kfree(system_blks); + return ret; +} + +/* + * Called when the filesystem is unmounted or when remounting it with + * noblock_validity specified. + * + * The update of system_blks pointer in this function is protected by + * sb->s_umount semaphore. However we have to be careful as we can be + * racing with ext4_inode_block_valid() calls reading system_blks rbtree + * protected only by RCU. So we first clear the system_blks pointer and + * then free the rbtree only after RCU grace period expires. + */ +void ext4_release_system_zone(struct super_block *sb) +{ + struct ext4_system_blocks *system_blks; + + system_blks = rcu_dereference_protected(EXT4_SB(sb)->s_system_blks, + lockdep_is_held(&sb->s_umount)); + rcu_assign_pointer(EXT4_SB(sb)->s_system_blks, NULL); + + if (system_blks) + call_rcu(&system_blks->rcu, ext4_destroy_system_zone); +} + +int ext4_sb_block_valid(struct super_block *sb, struct inode *inode, + ext4_fsblk_t start_blk, unsigned int count) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_system_blocks *system_blks; + struct ext4_system_zone *entry; + struct rb_node *n; + int ret = 1; + + if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count < start_blk) || + (start_blk + count > ext4_blocks_count(sbi->s_es))) + return 0; + + /* + * Lock the system zone to prevent it being released concurrently + * when doing a remount which inverse current "[no]block_validity" + * mount option. + */ + rcu_read_lock(); + system_blks = rcu_dereference(sbi->s_system_blks); + if (system_blks == NULL) + goto out_rcu; + + n = system_blks->root.rb_node; + while (n) { + entry = rb_entry(n, struct ext4_system_zone, node); + if (start_blk + count - 1 < entry->start_blk) + n = n->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = n->rb_right; + else { + ret = 0; + if (inode) + ret = (entry->ino == inode->i_ino); + break; + } + } +out_rcu: + rcu_read_unlock(); + return ret; +} + +/* + * Returns 1 if the passed-in block region (start_blk, + * start_blk+count) is valid; 0 if some part of the block region + * overlaps with some other filesystem metadata blocks. + */ +int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk, + unsigned int count) +{ + return ext4_sb_block_valid(inode->i_sb, inode, start_blk, count); +} + +int ext4_check_blockref(const char *function, unsigned int line, + struct inode *inode, __le32 *p, unsigned int max) +{ + __le32 *bref = p; + unsigned int blk; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + + if (journal && inode == journal->j_inode) + return 0; + + while (bref < p+max) { + blk = le32_to_cpu(*bref++); + if (blk && + unlikely(!ext4_inode_block_valid(inode, blk, 1))) { + ext4_error_inode(inode, function, line, blk, + "invalid block"); + return -EFSCORRUPTED; + } + } + return 0; +} + diff --git a/fs/ext4l/crypto.c b/fs/ext4l/crypto.c new file mode 100644 index 00000000000..cf0a0970c09 --- /dev/null +++ b/fs/ext4l/crypto.c @@ -0,0 +1,241 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/quotaops.h> +#include <linux/uuid.h> + +#include "ext4.h" +#include "xattr.h" +#include "ext4_jbd2.h" + +static void ext4_fname_from_fscrypt_name(struct ext4_filename *dst, + const struct fscrypt_name *src) +{ + memset(dst, 0, sizeof(*dst)); + + dst->usr_fname = src->usr_fname; + dst->disk_name = src->disk_name; + dst->hinfo.hash = src->hash; + dst->hinfo.minor_hash = src->minor_hash; + dst->crypto_buf = src->crypto_buf; +} + +int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct ext4_filename *fname) +{ + struct fscrypt_name name; + int err; + + err = fscrypt_setup_filename(dir, iname, lookup, &name); + if (err) + return err; + + ext4_fname_from_fscrypt_name(fname, &name); + + err = ext4_fname_setup_ci_filename(dir, iname, fname); + if (err) + ext4_fname_free_filename(fname); + + return err; +} + +int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, + struct ext4_filename *fname) +{ + struct fscrypt_name name; + int err; + + err = fscrypt_prepare_lookup(dir, dentry, &name); + if (err) + return err; + + ext4_fname_from_fscrypt_name(fname, &name); + + err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); + if (err) + ext4_fname_free_filename(fname); + return err; +} + +void ext4_fname_free_filename(struct ext4_filename *fname) +{ + struct fscrypt_name name; + + name.crypto_buf = fname->crypto_buf; + fscrypt_free_filename(&name); + + fname->crypto_buf.name = NULL; + fname->usr_fname = NULL; + fname->disk_name.name = NULL; + + ext4_fname_free_ci_filename(fname); +} + +static bool uuid_is_zero(__u8 u[16]) +{ + int i; + + for (i = 0; i < 16; i++) + if (u[i]) + return false; + return true; +} + +int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg) +{ + struct super_block *sb = file_inode(filp)->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err, err2; + handle_t *handle; + + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + + if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) { + err = mnt_want_write_file(filp); + if (err) + return err; + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto pwsalt_err_exit; + } + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); + if (err) + goto pwsalt_err_journal; + lock_buffer(sbi->s_sbh); + generate_random_uuid(sbi->s_es->s_encrypt_pw_salt); + ext4_superblock_csum_set(sb); + unlock_buffer(sbi->s_sbh); + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); +pwsalt_err_journal: + err2 = ext4_journal_stop(handle); + if (err2 && !err) + err = err2; +pwsalt_err_exit: + mnt_drop_write_file(filp); + if (err) + return err; + } + + if (copy_to_user(arg, sbi->s_es->s_encrypt_pw_salt, 16)) + return -EFAULT; + return 0; +} + +static int ext4_get_context(struct inode *inode, void *ctx, size_t len) +{ + return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len); +} + +static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, + void *fs_data) +{ + handle_t *handle = fs_data; + int res, res2, credits, retries = 0; + + /* + * Encrypting the root directory is not allowed because e2fsck expects + * lost+found to exist and be unencrypted, and encrypting the root + * directory would imply encrypting the lost+found directory as well as + * the filename "lost+found" itself. + */ + if (inode->i_ino == EXT4_ROOT_INO) + return -EPERM; + + if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode))) + return -EINVAL; + + if (ext4_test_inode_flag(inode, EXT4_INODE_DAX)) + return -EOPNOTSUPP; + + res = ext4_convert_inline_data(inode); + if (res) + return res; + + /* + * If a journal handle was specified, then the encryption context is + * being set on a new inode via inheritance and is part of a larger + * transaction to create the inode. Otherwise the encryption context is + * being set on an existing inode in its own transaction. Only in the + * latter case should the "retry on ENOSPC" logic be used. + */ + + if (handle) { + res = ext4_xattr_set_handle(handle, inode, + EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); + if (!res) { + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + ext4_clear_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA); + /* + * Update inode->i_flags - S_ENCRYPTED will be enabled, + * S_DAX may be disabled + */ + ext4_set_inode_flags(inode, false); + } + return res; + } + + res = dquot_initialize(inode); + if (res) + return res; +retry: + res = ext4_xattr_set_credits(inode, len, false /* is_create */, + &credits); + if (res) + return res; + + handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); + if (!res) { + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + /* + * Update inode->i_flags - S_ENCRYPTED will be enabled, + * S_DAX may be disabled + */ + ext4_set_inode_flags(inode, false); + res = ext4_mark_inode_dirty(handle, inode); + if (res) + EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); + } + res2 = ext4_journal_stop(handle); + + if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + if (!res) + res = res2; + return res; +} + +static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb) +{ + return EXT4_SB(sb)->s_dummy_enc_policy.policy; +} + +static bool ext4_has_stable_inodes(struct super_block *sb) +{ + return ext4_has_feature_stable_inodes(sb); +} + +const struct fscrypt_operations ext4_cryptops = { + .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_crypt_info) - + (int)offsetof(struct ext4_inode_info, vfs_inode), + .needs_bounce_pages = 1, + .has_32bit_inodes = 1, + .supports_subblock_data_units = 1, + .legacy_key_prefix = "ext4:", + .get_context = ext4_get_context, + .set_context = ext4_set_context, + .get_dummy_policy = ext4_get_dummy_policy, + .empty_dir = ext4_empty_dir, + .has_stable_inodes = ext4_has_stable_inodes, +}; diff --git a/fs/ext4l/dir.c b/fs/ext4l/dir.c new file mode 100644 index 00000000000..d4164c507a9 --- /dev/null +++ b/fs/ext4l/dir.c @@ -0,0 +1,693 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Hash Tree Directory indexing (c) 2001 Daniel Phillips + * + */ + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/slab.h> +#include <linux/iversion.h> +#include <linux/unicode.h> +#include "ext4.h" +#include "xattr.h" + +static int ext4_dx_readdir(struct file *, struct dir_context *); + +/** + * is_dx_dir() - check if a directory is using htree indexing + * @inode: directory inode + * + * Check if the given dir-inode refers to an htree-indexed directory + * (or a directory which could potentially get converted to use htree + * indexing). + * + * Return 1 if it is a dx dir, 0 if not + */ +static int is_dx_dir(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (ext4_has_feature_dir_index(inode->i_sb) && + ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || + ((inode->i_size >> sb->s_blocksize_bits) == 1) || + ext4_has_inline_data(inode))) + return 1; + + return 0; +} + +static bool is_fake_dir_entry(struct ext4_dir_entry_2 *de) +{ + /* Check if . or .. , or skip if namelen is 0 */ + if ((de->name_len > 0) && (de->name_len <= 2) && (de->name[0] == '.') && + (de->name[1] == '.' || de->name[1] == '\0')) + return true; + /* Check if this is a csum entry */ + if (de->file_type == EXT4_FT_DIR_CSUM) + return true; + return false; +} + +/* + * Return 0 if the directory entry is OK, and 1 if there is a problem + * + * Note: this is the opposite of what ext2 and ext3 historically returned... + * + * bh passed here can be an inode block or a dir data block, depending + * on the inode inline data flag. + */ +int __ext4_check_dir_entry(const char *function, unsigned int line, + struct inode *dir, struct file *filp, + struct ext4_dir_entry_2 *de, + struct buffer_head *bh, char *buf, int size, + unsigned int offset) +{ + const char *error_msg = NULL; + const int rlen = ext4_rec_len_from_disk(de->rec_len, + dir->i_sb->s_blocksize); + const int next_offset = ((char *) de - buf) + rlen; + bool fake = is_fake_dir_entry(de); + bool has_csum = ext4_has_feature_metadata_csum(dir->i_sb); + + if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir))) + error_msg = "rec_len is smaller than minimal"; + else if (unlikely(rlen % 4 != 0)) + error_msg = "rec_len % 4 != 0"; + else if (unlikely(rlen < ext4_dir_rec_len(de->name_len, + fake ? NULL : dir))) + error_msg = "rec_len is too small for name_len"; + else if (unlikely(next_offset > size)) + error_msg = "directory entry overrun"; + else if (unlikely(next_offset > size - ext4_dir_rec_len(1, + has_csum ? NULL : dir) && + next_offset != size)) + error_msg = "directory entry too close to block end"; + else if (unlikely(le32_to_cpu(de->inode) > + le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) + error_msg = "inode out of bounds"; + else if (unlikely(next_offset == size && de->name_len == 1 && + de->name[0] == '.')) + error_msg = "'.' directory cannot be the last in data block"; + else + return 0; + + if (filp) + ext4_error_file(filp, function, line, bh->b_blocknr, + "bad entry in directory: %s - offset=%u, " + "inode=%u, rec_len=%d, size=%d fake=%d", + error_msg, offset, le32_to_cpu(de->inode), + rlen, size, fake); + else + ext4_error_inode(dir, function, line, bh->b_blocknr, + "bad entry in directory: %s - offset=%u, " + "inode=%u, rec_len=%d, size=%d fake=%d", + error_msg, offset, le32_to_cpu(de->inode), + rlen, size, fake); + + return 1; +} + +static int ext4_readdir(struct file *file, struct dir_context *ctx) +{ + unsigned int offset; + int i; + struct ext4_dir_entry_2 *de; + int err; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + struct buffer_head *bh = NULL; + struct fscrypt_str fstr = FSTR_INIT(NULL, 0); + struct dir_private_info *info = file->private_data; + + err = fscrypt_prepare_readdir(inode); + if (err) + return err; + + if (is_dx_dir(inode)) { + err = ext4_dx_readdir(file, ctx); + if (err != ERR_BAD_DX_DIR) + return err; + + /* Can we just clear INDEX flag to ignore htree information? */ + if (!ext4_has_feature_metadata_csum(sb)) { + /* + * We don't set the inode dirty flag since it's not + * critical that it gets flushed back to the disk. + */ + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); + } + } + + if (ext4_has_inline_data(inode)) { + int has_inline_data = 1; + err = ext4_read_inline_dir(file, ctx, + &has_inline_data); + if (has_inline_data) + return err; + } + + if (IS_ENCRYPTED(inode)) { + err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr); + if (err < 0) + return err; + } + + while (ctx->pos < inode->i_size) { + struct ext4_map_blocks map; + + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto errout; + } + cond_resched(); + offset = ctx->pos & (sb->s_blocksize - 1); + map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); + map.m_len = 1; + err = ext4_map_blocks(NULL, inode, &map, 0); + if (err == 0) { + /* m_len should never be zero but let's avoid + * an infinite loop if it somehow is */ + if (map.m_len == 0) + map.m_len = 1; + ctx->pos += map.m_len * sb->s_blocksize; + continue; + } + if (err > 0) { + pgoff_t index = map.m_pblk >> + (PAGE_SHIFT - inode->i_blkbits); + if (!ra_has_index(&file->f_ra, index)) + page_cache_sync_readahead( + sb->s_bdev->bd_mapping, + &file->f_ra, file, + index, 1); + file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT; + bh = ext4_bread(NULL, inode, map.m_lblk, 0); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + bh = NULL; + goto errout; + } + } + + if (!bh) { + /* corrupt size? Maybe no more blocks to read */ + if (ctx->pos > inode->i_blocks << 9) + break; + ctx->pos += sb->s_blocksize - offset; + continue; + } + + /* Check the checksum */ + if (!buffer_verified(bh) && + !ext4_dirblock_csum_verify(inode, bh)) { + EXT4_ERROR_FILE(file, 0, "directory fails checksum " + "at offset %llu", + (unsigned long long)ctx->pos); + ctx->pos += sb->s_blocksize - offset; + brelse(bh); + bh = NULL; + continue; + } + set_buffer_verified(bh); + + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (!inode_eq_iversion(inode, info->cookie)) { + for (i = 0; i < sb->s_blocksize && i < offset; ) { + de = (struct ext4_dir_entry_2 *) + (bh->b_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize) < ext4_dir_rec_len(1, + inode)) + break; + i += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); + } + offset = i; + ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) + | offset; + info->cookie = inode_query_iversion(inode); + } + + while (ctx->pos < inode->i_size + && offset < sb->s_blocksize) { + de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); + if (ext4_check_dir_entry(inode, file, de, bh, + bh->b_data, bh->b_size, + offset)) { + /* + * On error, skip to the next block + */ + ctx->pos = (ctx->pos | + (sb->s_blocksize - 1)) + 1; + break; + } + offset += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); + if (le32_to_cpu(de->inode)) { + if (!IS_ENCRYPTED(inode)) { + if (!dir_emit(ctx, de->name, + de->name_len, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) + goto done; + } else { + int save_len = fstr.len; + struct fscrypt_str de_name = + FSTR_INIT(de->name, + de->name_len); + u32 hash; + u32 minor_hash; + + if (IS_CASEFOLDED(inode)) { + hash = EXT4_DIRENT_HASH(de); + minor_hash = EXT4_DIRENT_MINOR_HASH(de); + } else { + hash = 0; + minor_hash = 0; + } + + /* Directory is encrypted */ + err = fscrypt_fname_disk_to_usr(inode, + hash, minor_hash, &de_name, &fstr); + de_name = fstr; + fstr.len = save_len; + if (err) + goto errout; + if (!dir_emit(ctx, + de_name.name, de_name.len, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) + goto done; + } + } + ctx->pos += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); + } + if ((ctx->pos < inode->i_size) && !dir_relax_shared(inode)) + goto done; + brelse(bh); + bh = NULL; + } +done: + err = 0; +errout: + fscrypt_fname_free_buffer(&fstr); + brelse(bh); + return err; +} + +static inline int is_32bit_api(void) +{ +#ifdef CONFIG_COMPAT + return in_compat_syscall(); +#else + return (BITS_PER_LONG == 32); +#endif +} + +/* + * These functions convert from the major/minor hash to an f_pos + * value for dx directories + * + * Upper layer (for example NFS) should specify FMODE_32BITHASH or + * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted + * directly on both 32-bit and 64-bit nodes, under such case, neither + * FMODE_32BITHASH nor FMODE_64BITHASH is specified. + */ +static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return major >> 1; + else + return ((__u64)(major >> 1) << 32) | (__u64)minor; +} + +static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return (pos << 1) & 0xffffffff; + else + return ((pos >> 32) << 1) & 0xffffffff; +} + +static inline __u32 pos2min_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return 0; + else + return pos & 0xffffffff; +} + +/* + * Return 32- or 64-bit end-of-file for dx directories + */ +static inline loff_t ext4_get_htree_eof(struct file *filp) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return EXT4_HTREE_EOF_32BIT; + else + return EXT4_HTREE_EOF_64BIT; +} + + +/* + * ext4_dir_llseek() calls generic_file_llseek_size to handle htree + * directories, where the "offset" is in terms of the filename hash + * value instead of the byte offset. + * + * Because we may return a 64-bit hash that is well beyond offset limits, + * we need to pass the max hash as the maximum allowable offset in + * the htree directory case. + * + * For non-htree, ext4_llseek already chooses the proper max offset. + */ +static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + struct dir_private_info *info = file->private_data; + int dx_dir = is_dx_dir(inode); + loff_t ret, htree_max = ext4_get_htree_eof(file); + + if (likely(dx_dir)) + ret = generic_file_llseek_size(file, offset, whence, + htree_max, htree_max); + else + ret = ext4_llseek(file, offset, whence); + info->cookie = inode_peek_iversion(inode) - 1; + return ret; +} + +/* + * This structure holds the nodes of the red-black tree used to store + * the directory entry in hash order. + */ +struct fname { + __u32 hash; + __u32 minor_hash; + struct rb_node rb_hash; + struct fname *next; + __u32 inode; + __u8 name_len; + __u8 file_type; + char name[] __counted_by(name_len); +}; + +/* + * This function implements a non-recursive way of freeing all of the + * nodes in the red-black tree. + */ +static void free_rb_tree_fname(struct rb_root *root) +{ + struct fname *fname, *next; + + rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) + while (fname) { + struct fname *old = fname; + fname = fname->next; + kfree(old); + } + + *root = RB_ROOT; +} + +static void ext4_htree_init_dir_info(struct file *filp, loff_t pos) +{ + struct dir_private_info *p = filp->private_data; + + if (is_dx_dir(file_inode(filp)) && !p->initialized) { + p->curr_hash = pos2maj_hash(filp, pos); + p->curr_minor_hash = pos2min_hash(filp, pos); + p->initialized = true; + } +} + +void ext4_htree_free_dir_info(struct dir_private_info *p) +{ + free_rb_tree_fname(&p->root); + kfree(p); +} + +/* + * Given a directory entry, enter it into the fname rb tree. + * + * When filename encryption is enabled, the dirent will hold the + * encrypted filename, while the htree will hold decrypted filename. + * The decrypted filename is passed in via ent_name. parameter. + */ +int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent, + struct fscrypt_str *ent_name) +{ + struct rb_node **p, *parent = NULL; + struct fname *fname, *new_fn; + struct dir_private_info *info; + + info = dir_file->private_data; + p = &info->root.rb_node; + + /* Create and allocate the fname structure */ + new_fn = kzalloc(struct_size(new_fn, name, ent_name->len + 1), + GFP_KERNEL); + if (!new_fn) + return -ENOMEM; + new_fn->hash = hash; + new_fn->minor_hash = minor_hash; + new_fn->inode = le32_to_cpu(dirent->inode); + new_fn->name_len = ent_name->len; + new_fn->file_type = dirent->file_type; + memcpy(new_fn->name, ent_name->name, ent_name->len); + + while (*p) { + parent = *p; + fname = rb_entry(parent, struct fname, rb_hash); + + /* + * If the hash and minor hash match up, then we put + * them on a linked list. This rarely happens... + */ + if ((new_fn->hash == fname->hash) && + (new_fn->minor_hash == fname->minor_hash)) { + new_fn->next = fname->next; + fname->next = new_fn; + return 0; + } + + if (new_fn->hash < fname->hash) + p = &(*p)->rb_left; + else if (new_fn->hash > fname->hash) + p = &(*p)->rb_right; + else if (new_fn->minor_hash < fname->minor_hash) + p = &(*p)->rb_left; + else /* if (new_fn->minor_hash > fname->minor_hash) */ + p = &(*p)->rb_right; + } + + rb_link_node(&new_fn->rb_hash, parent, p); + rb_insert_color(&new_fn->rb_hash, &info->root); + return 0; +} + + + +/* + * This is a helper function for ext4_dx_readdir. It calls filldir + * for all entries on the fname linked list. (Normally there is only + * one entry on the linked list, unless there are 62 bit hash collisions.) + */ +static int call_filldir(struct file *file, struct dir_context *ctx, + struct fname *fname) +{ + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + + if (!fname) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " + "called with null fname?!?", __func__, __LINE__, + inode->i_ino, current->comm); + return 0; + } + ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); + while (fname) { + if (!dir_emit(ctx, fname->name, + fname->name_len, + fname->inode, + get_dtype(sb, fname->file_type))) { + info->extra_fname = fname; + return 1; + } + fname = fname->next; + } + return 0; +} + +static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) +{ + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); + struct fname *fname; + int ret = 0; + + ext4_htree_init_dir_info(file, ctx->pos); + + if (ctx->pos == ext4_get_htree_eof(file)) + return 0; /* EOF */ + + /* Some one has messed with f_pos; reset the world */ + if (info->last_pos != ctx->pos) { + free_rb_tree_fname(&info->root); + info->curr_node = NULL; + info->extra_fname = NULL; + info->curr_hash = pos2maj_hash(file, ctx->pos); + info->curr_minor_hash = pos2min_hash(file, ctx->pos); + } + + /* + * If there are any leftover names on the hash collision + * chain, return them first. + */ + if (info->extra_fname) { + if (call_filldir(file, ctx, info->extra_fname)) + goto finished; + info->extra_fname = NULL; + goto next_node; + } else if (!info->curr_node) + info->curr_node = rb_first(&info->root); + + while (1) { + /* + * Fill the rbtree if we have no more entries, + * or the inode has changed since we last read in the + * cached entries. + */ + if ((!info->curr_node) || + !inode_eq_iversion(inode, info->cookie)) { + info->curr_node = NULL; + free_rb_tree_fname(&info->root); + info->cookie = inode_query_iversion(inode); + ret = ext4_htree_fill_tree(file, info->curr_hash, + info->curr_minor_hash, + &info->next_hash); + if (ret < 0) + goto finished; + if (ret == 0) { + ctx->pos = ext4_get_htree_eof(file); + break; + } + info->curr_node = rb_first(&info->root); + } + + fname = rb_entry(info->curr_node, struct fname, rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + if (call_filldir(file, ctx, fname)) + break; + next_node: + info->curr_node = rb_next(info->curr_node); + if (info->curr_node) { + fname = rb_entry(info->curr_node, struct fname, + rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + } else { + if (info->next_hash == ~0) { + ctx->pos = ext4_get_htree_eof(file); + break; + } + info->curr_hash = info->next_hash; + info->curr_minor_hash = 0; + } + } +finished: + info->last_pos = ctx->pos; + return ret < 0 ? ret : 0; +} + +static int ext4_release_dir(struct inode *inode, struct file *filp) +{ + if (filp->private_data) + ext4_htree_free_dir_info(filp->private_data); + + return 0; +} + +int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, + int buf_size) +{ + struct ext4_dir_entry_2 *de; + int rlen; + unsigned int offset = 0; + char *top; + + de = buf; + top = buf + buf_size; + while ((char *) de < top) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + buf, buf_size, offset)) + return -EFSCORRUPTED; + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -EFSCORRUPTED; + + return 0; +} + +static int ext4_dir_open(struct inode *inode, struct file *file) +{ + struct dir_private_info *info; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + file->private_data = info; + return 0; +} + +const struct file_operations ext4_dir_operations = { + .open = ext4_dir_open, + .llseek = ext4_dir_llseek, + .read = generic_read_dir, + .iterate_shared = ext4_readdir, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .fsync = ext4_sync_file, + .release = ext4_release_dir, +}; -- 2.43.0