From: Simon Glass <simon.glass@canonical.com> Copy inode-test.c, ioctl.c, mballoc.h, and mballoc-test.c from Linux v6.18 fs/ext4 directory. - inode-test: KUnit tests for inode timestamp handling - ioctl: filesystem ioctl handlers - mballoc.h: multiblock allocator header - mballoc-test: KUnit tests for multiblock allocator Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> --- fs/ext4l/inode-test.c | 283 ++++++ fs/ext4l/ioctl.c | 2020 +++++++++++++++++++++++++++++++++++++++ fs/ext4l/mballoc-test.c | 999 +++++++++++++++++++ fs/ext4l/mballoc.h | 273 ++++++ 4 files changed, 3575 insertions(+) create mode 100644 fs/ext4l/inode-test.c create mode 100644 fs/ext4l/ioctl.c create mode 100644 fs/ext4l/mballoc-test.c create mode 100644 fs/ext4l/mballoc.h diff --git a/fs/ext4l/inode-test.c b/fs/ext4l/inode-test.c new file mode 100644 index 00000000000..749af7ad4e0 --- /dev/null +++ b/fs/ext4l/inode-test.c @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test of ext4 inode that verify the seconds part of [a/c/m] + * timestamps in ext4 inode structs are decoded correctly. + */ + +#include <kunit/test.h> +#include <linux/kernel.h> +#include <linux/time64.h> + +#include "ext4.h" + +/* + * For constructing the nonnegative timestamp lower bound value. + * binary: 00000000 00000000 00000000 00000000 + */ +#define LOWER_MSB_0 0L +/* + * For constructing the nonnegative timestamp upper bound value. + * binary: 01111111 11111111 11111111 11111111 + * + */ +#define UPPER_MSB_0 0x7fffffffL +/* + * For constructing the negative timestamp lower bound value. + * binary: 10000000 00000000 00000000 00000000 + */ +#define LOWER_MSB_1 (-(UPPER_MSB_0) - 1L) /* avoid overflow */ +/* + * For constructing the negative timestamp upper bound value. + * binary: 11111111 11111111 11111111 11111111 + */ +#define UPPER_MSB_1 (-1L) +/* + * Upper bound for nanoseconds value supported by the encoding. + * binary: 00111111 11111111 11111111 11111111 + */ +#define MAX_NANOSECONDS ((1L << 30) - 1) + +#define CASE_NAME_FORMAT "%s: msb:%x lower_bound:%x extra_bits: %x" + +#define LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE\ + "1901-12-13 Lower bound of 32bit < 0 timestamp, no extra bits" +#define UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE\ + "1969-12-31 Upper bound of 32bit < 0 timestamp, no extra bits" +#define LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE\ + "1970-01-01 Lower bound of 32bit >=0 timestamp, no extra bits" +#define UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE\ + "2038-01-19 Upper bound of 32bit >=0 timestamp, no extra bits" +#define LOWER_BOUND_NEG_LO_1_CASE\ + "2038-01-19 Lower bound of 32bit <0 timestamp, lo extra sec bit on" +#define UPPER_BOUND_NEG_LO_1_CASE\ + "2106-02-07 Upper bound of 32bit <0 timestamp, lo extra sec bit on" +#define LOWER_BOUND_NONNEG_LO_1_CASE\ + "2106-02-07 Lower bound of 32bit >=0 timestamp, lo extra sec bit on" +#define UPPER_BOUND_NONNEG_LO_1_CASE\ + "2174-02-25 Upper bound of 32bit >=0 timestamp, lo extra sec bit on" +#define LOWER_BOUND_NEG_HI_1_CASE\ + "2174-02-25 Lower bound of 32bit <0 timestamp, hi extra sec bit on" +#define UPPER_BOUND_NEG_HI_1_CASE\ + "2242-03-16 Upper bound of 32bit <0 timestamp, hi extra sec bit on" +#define LOWER_BOUND_NONNEG_HI_1_CASE\ + "2242-03-16 Lower bound of 32bit >=0 timestamp, hi extra sec bit on" +#define UPPER_BOUND_NONNEG_HI_1_CASE\ + "2310-04-04 Upper bound of 32bit >=0 timestamp, hi extra sec bit on" +#define UPPER_BOUND_NONNEG_HI_1_NS_1_CASE\ + "2310-04-04 Upper bound of 32bit>=0 timestamp, hi extra sec bit 1. 
1 ns" +#define LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE\ + "2378-04-22 Lower bound of 32bit>= timestamp. Extra sec bits 1. Max ns" +#define LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE\ + "2378-04-22 Lower bound of 32bit >=0 timestamp. All extra sec bits on" +#define UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE\ + "2446-05-10 Upper bound of 32bit >=0 timestamp. All extra sec bits on" + +struct timestamp_expectation { + const char *test_case_name; + struct timespec64 expected; + u32 extra_bits; + bool msb_set; + bool lower_bound; +}; + +static const struct timestamp_expectation test_data[] = { + { + .test_case_name = LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE, + .msb_set = true, + .lower_bound = true, + .extra_bits = 0, + .expected = {.tv_sec = -0x80000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE, + .msb_set = true, + .lower_bound = false, + .extra_bits = 0, + .expected = {.tv_sec = -1LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE, + .msb_set = false, + .lower_bound = true, + .extra_bits = 0, + .expected = {0LL, 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE, + .msb_set = false, + .lower_bound = false, + .extra_bits = 0, + .expected = {.tv_sec = 0x7fffffffLL, .tv_nsec = 0L}, + }, + + { + .test_case_name = LOWER_BOUND_NEG_LO_1_CASE, + .msb_set = true, + .lower_bound = true, + .extra_bits = 1, + .expected = {.tv_sec = 0x80000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NEG_LO_1_CASE, + .msb_set = true, + .lower_bound = false, + .extra_bits = 1, + .expected = {.tv_sec = 0xffffffffLL, .tv_nsec = 0L}, + }, + + { + .test_case_name = LOWER_BOUND_NONNEG_LO_1_CASE, + .msb_set = false, + .lower_bound = true, + .extra_bits = 1, + .expected = {.tv_sec = 0x100000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NONNEG_LO_1_CASE, + .msb_set = false, + .lower_bound = false, + .extra_bits = 1, + .expected = {.tv_sec = 0x17fffffffLL, .tv_nsec = 0L}, + }, + + { + .test_case_name = LOWER_BOUND_NEG_HI_1_CASE, + .msb_set = true, + .lower_bound = true, + .extra_bits = 2, + .expected = {.tv_sec = 0x180000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NEG_HI_1_CASE, + .msb_set = true, + .lower_bound = false, + .extra_bits = 2, + .expected = {.tv_sec = 0x1ffffffffLL, .tv_nsec = 0L}, + }, + + { + .test_case_name = LOWER_BOUND_NONNEG_HI_1_CASE, + .msb_set = false, + .lower_bound = true, + .extra_bits = 2, + .expected = {.tv_sec = 0x200000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NONNEG_HI_1_CASE, + .msb_set = false, + .lower_bound = false, + .extra_bits = 2, + .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NONNEG_HI_1_NS_1_CASE, + .msb_set = false, + .lower_bound = false, + .extra_bits = 6, + .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 1L}, + }, + + { + .test_case_name = LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE, + .msb_set = false, + .lower_bound = true, + .extra_bits = 0xFFFFFFFF, + .expected = {.tv_sec = 0x300000000LL, + .tv_nsec = MAX_NANOSECONDS}, + }, + + { + .test_case_name = LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE, + .msb_set = false, + .lower_bound = true, + .extra_bits = 3, + .expected = {.tv_sec = 0x300000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE, + .msb_set = false, + .lower_bound = false, + .extra_bits = 3, + .expected = {.tv_sec = 0x37fffffffLL, .tv_nsec = 0L}, + } +}; + +static void timestamp_expectation_to_desc(const struct timestamp_expectation 
*t, + char *desc) +{ + strscpy(desc, t->test_case_name, KUNIT_PARAM_DESC_SIZE); +} + +KUNIT_ARRAY_PARAM(ext4_inode, test_data, timestamp_expectation_to_desc); + +static time64_t get_32bit_time(const struct timestamp_expectation * const test) +{ + if (test->msb_set) { + if (test->lower_bound) + return LOWER_MSB_1; + + return UPPER_MSB_1; + } + + if (test->lower_bound) + return LOWER_MSB_0; + return UPPER_MSB_0; +} + + +/* + * Test data is derived from the table in the Inode Timestamps section of + * Documentation/filesystems/ext4/inodes.rst. + */ +static void inode_test_xtimestamp_decoding(struct kunit *test) +{ + struct timespec64 timestamp; + + struct timestamp_expectation *test_param = + (struct timestamp_expectation *)(test->param_value); + + timestamp = ext4_decode_extra_time( + cpu_to_le32(get_32bit_time(test_param)), + cpu_to_le32(test_param->extra_bits)); + + KUNIT_EXPECT_EQ_MSG(test, + test_param->expected.tv_sec, + timestamp.tv_sec, + CASE_NAME_FORMAT, + test_param->test_case_name, + test_param->msb_set, + test_param->lower_bound, + test_param->extra_bits); + KUNIT_EXPECT_EQ_MSG(test, + test_param->expected.tv_nsec, + timestamp.tv_nsec, + CASE_NAME_FORMAT, + test_param->test_case_name, + test_param->msb_set, + test_param->lower_bound, + test_param->extra_bits); +} + +static struct kunit_case ext4_inode_test_cases[] = { + KUNIT_CASE_PARAM(inode_test_xtimestamp_decoding, ext4_inode_gen_params), + {} +}; + +static struct kunit_suite ext4_inode_test_suite = { + .name = "ext4_inode_test", + .test_cases = ext4_inode_test_cases, +}; + +kunit_test_suites(&ext4_inode_test_suite); + +MODULE_DESCRIPTION("KUnit test of ext4 inode timestamp decoding"); +MODULE_LICENSE("GPL v2"); diff --git a/fs/ext4l/ioctl.c b/fs/ext4l/ioctl.c new file mode 100644 index 00000000000..a93a7baae99 --- /dev/null +++ b/fs/ext4l/ioctl.c @@ -0,0 +1,2020 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/ioctl.c + * + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include <linux/fs.h> +#include <linux/capability.h> +#include <linux/time.h> +#include <linux/compat.h> +#include <linux/mount.h> +#include <linux/file.h> +#include <linux/quotaops.h> +#include <linux/random.h> +#include <linux/uaccess.h> +#include <linux/delay.h> +#include <linux/iversion.h> +#include <linux/fileattr.h> +#include <linux/uuid.h> +#include "ext4_jbd2.h" +#include "ext4.h" +#include <linux/fsmap.h> +#include "fsmap.h" +#include <trace/events/ext4.h> + +typedef void ext4_update_sb_callback(struct ext4_sb_info *sbi, + struct ext4_super_block *es, + const void *arg); + +/* + * Superblock modification callback function for changing file system + * label + */ +static void ext4_sb_setlabel(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) +{ + /* Sanity check, this should never happen */ + BUILD_BUG_ON(sizeof(es->s_volume_name) < EXT4_LABEL_MAX); + + memcpy(es->s_volume_name, (char *)arg, EXT4_LABEL_MAX); +} + +/* + * Superblock modification callback function for changing file system + * UUID. 
+ */ +static void ext4_sb_setuuid(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) +{ + memcpy(es->s_uuid, (__u8 *)arg, UUID_SIZE); +} + +static +int ext4_update_primary_sb(struct super_block *sb, handle_t *handle, + ext4_update_sb_callback func, + const void *arg) +{ + int err = 0; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *bh = sbi->s_sbh; + struct ext4_super_block *es = sbi->s_es; + + trace_ext4_update_sb(sb, bh->b_blocknr, 1); + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, + bh, + EXT4_JTR_NONE); + if (err) + goto out_err; + + lock_buffer(bh); + func(sbi, es, arg); + ext4_superblock_csum_set(sb); + unlock_buffer(bh); + + if (buffer_write_io_error(bh) || !buffer_uptodate(bh)) { + ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " + "superblock detected"); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out_err; + err = sync_dirty_buffer(bh); +out_err: + ext4_std_error(sb, err); + return err; +} + +/* + * Update one backup superblock in the group 'grp' using the callback + * function 'func' and argument 'arg'. If the handle is NULL the + * modification is not journalled. + * + * Returns: 0 when no modification was done (no superblock in the group) + * 1 when the modification was successful + * <0 on error + */ +static int ext4_update_backup_sb(struct super_block *sb, + handle_t *handle, ext4_group_t grp, + ext4_update_sb_callback func, const void *arg) +{ + int err = 0; + ext4_fsblk_t sb_block; + struct buffer_head *bh; + unsigned long offset = 0; + struct ext4_super_block *es; + + if (!ext4_bg_has_super(sb, grp)) + return 0; + + /* + * For the group 0 there is always 1k padding, so we have + * either adjust offset, or sb_block depending on blocksize + */ + if (grp == 0) { + sb_block = 1 * EXT4_MIN_BLOCK_SIZE; + offset = do_div(sb_block, sb->s_blocksize); + } else { + sb_block = ext4_group_first_block_no(sb, grp); + offset = 0; + } + + trace_ext4_update_sb(sb, sb_block, handle ? 1 : 0); + + bh = ext4_sb_bread(sb, sb_block, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + + if (handle) { + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, + bh, + EXT4_JTR_NONE); + if (err) + goto out_bh; + } + + es = (struct ext4_super_block *) (bh->b_data + offset); + lock_buffer(bh); + if (ext4_has_feature_metadata_csum(sb) && + es->s_checksum != ext4_superblock_csum(es)) { + ext4_msg(sb, KERN_ERR, "Invalid checksum for backup " + "superblock %llu", sb_block); + unlock_buffer(bh); + goto out_bh; + } + func(EXT4_SB(sb), es, arg); + if (ext4_has_feature_metadata_csum(sb)) + es->s_checksum = ext4_superblock_csum(es); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + if (handle) { + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out_bh; + } else { + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + } + err = sync_dirty_buffer(bh); + +out_bh: + brelse(bh); + ext4_std_error(sb, err); + return (err) ? err : 1; +} + +/* + * Update primary and backup superblocks using the provided function + * func and argument arg. + * + * Only the primary superblock and at most two backup superblock + * modifications are journalled; the rest is modified without journal. + * This is safe because e2fsck will re-write them if there is a problem, + * and we're very unlikely to ever need more than two backups. 
+ */ +static +int ext4_update_superblocks_fn(struct super_block *sb, + ext4_update_sb_callback func, + const void *arg) +{ + handle_t *handle; + ext4_group_t ngroups; + unsigned int three = 1; + unsigned int five = 5; + unsigned int seven = 7; + int err = 0, ret, i; + ext4_group_t grp, primary_grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* + * We can't update superblocks while the online resize is running + */ + if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, + &sbi->s_ext4_flags)) { + ext4_msg(sb, KERN_ERR, "Can't modify superblock while" + "performing online resize"); + return -EBUSY; + } + + /* + * We're only going to update primary superblock and two + * backup superblocks in this transaction. + */ + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out; + } + + /* Update primary superblock */ + err = ext4_update_primary_sb(sb, handle, func, arg); + if (err) { + ext4_msg(sb, KERN_ERR, "Failed to update primary " + "superblock"); + goto out_journal; + } + + primary_grp = ext4_get_group_number(sb, sbi->s_sbh->b_blocknr); + ngroups = ext4_get_groups_count(sb); + + /* + * Update backup superblocks. We have to start from group 0 + * because it might not be where the primary superblock is + * if the fs is mounted with -o sb=<backup_sb_block> + */ + i = 0; + grp = 0; + while (grp < ngroups) { + /* Skip primary superblock */ + if (grp == primary_grp) + goto next_grp; + + ret = ext4_update_backup_sb(sb, handle, grp, func, arg); + if (ret < 0) { + /* Ignore bad checksum; try to update next sb */ + if (ret == -EFSBADCRC) + goto next_grp; + err = ret; + goto out_journal; + } + + i += ret; + if (handle && i > 1) { + /* + * We're only journalling primary superblock and + * two backup superblocks; the rest is not + * journalled. + */ + err = ext4_journal_stop(handle); + if (err) + goto out; + handle = NULL; + } +next_grp: + grp = ext4_list_backups(sb, &three, &five, &seven); + } + +out_journal: + if (handle) { + ret = ext4_journal_stop(handle); + if (ret && !err) + err = ret; + } +out: + clear_bit_unlock(EXT4_FLAGS_RESIZING, &sbi->s_ext4_flags); + smp_mb__after_atomic(); + return err ? err : 0; +} + +/* + * Swap memory between @a and @b for @len bytes. + * + * @a: pointer to first memory area + * @b: pointer to second memory area + * @len: number of bytes to swap + * + */ +static void memswap(void *a, void *b, size_t len) +{ + unsigned char *ap, *bp; + + ap = (unsigned char *)a; + bp = (unsigned char *)b; + while (len-- > 0) { + swap(*ap, *bp); + ap++; + bp++; + } +} + +/* + * Swap i_data and associated attributes between @inode1 and @inode2. + * This function is used for the primary swap between inode1 and inode2 + * and also to revert this primary swap in case of errors. + * + * Therefore you have to make sure, that calling this method twice + * will revert all changes. 
+ * + * @inode1: pointer to first inode + * @inode2: pointer to second inode + */ +static void swap_inode_data(struct inode *inode1, struct inode *inode2) +{ + loff_t isize; + struct ext4_inode_info *ei1; + struct ext4_inode_info *ei2; + unsigned long tmp; + struct timespec64 ts1, ts2; + + ei1 = EXT4_I(inode1); + ei2 = EXT4_I(inode2); + + swap(inode1->i_version, inode2->i_version); + + ts1 = inode_get_atime(inode1); + ts2 = inode_get_atime(inode2); + inode_set_atime_to_ts(inode1, ts2); + inode_set_atime_to_ts(inode2, ts1); + + ts1 = inode_get_mtime(inode1); + ts2 = inode_get_mtime(inode2); + inode_set_mtime_to_ts(inode1, ts2); + inode_set_mtime_to_ts(inode2, ts1); + + memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); + tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP; + ei1->i_flags = (ei2->i_flags & EXT4_FL_SHOULD_SWAP) | + (ei1->i_flags & ~EXT4_FL_SHOULD_SWAP); + ei2->i_flags = tmp | (ei2->i_flags & ~EXT4_FL_SHOULD_SWAP); + swap(ei1->i_disksize, ei2->i_disksize); + ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); + ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); + + isize = i_size_read(inode1); + i_size_write(inode1, i_size_read(inode2)); + i_size_write(inode2, isize); +} + +void ext4_reset_inode_seed(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __le32 inum = cpu_to_le32(inode->i_ino); + __le32 gen = cpu_to_le32(inode->i_generation); + __u32 csum; + + if (!ext4_has_feature_metadata_csum(inode->i_sb)) + return; + + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); +} + +/* + * Swap the information from the given @inode and the inode + * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other + * important fields of the inodes. + * + * @sb: the super block of the filesystem + * @idmap: idmap of the mount the inode was found from + * @inode: the inode to swap with EXT4_BOOT_LOADER_INO + * + */ +static long swap_inode_boot_loader(struct super_block *sb, + struct mnt_idmap *idmap, + struct inode *inode) +{ + handle_t *handle; + int err; + struct inode *inode_bl; + struct ext4_inode_info *ei_bl; + qsize_t size, size_bl, diff; + blkcnt_t blocks; + unsigned short bytes; + + inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, + EXT4_IGET_SPECIAL | EXT4_IGET_BAD); + if (IS_ERR(inode_bl)) + return PTR_ERR(inode_bl); + ei_bl = EXT4_I(inode_bl); + + /* Protect orig inodes against a truncate and make sure, + * that only 1 swap_inode_boot_loader is running. 
*/ + lock_two_nondirectories(inode, inode_bl); + + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || + IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || + (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) || + ext4_has_inline_data(inode)) { + err = -EINVAL; + goto journal_err_out; + } + + if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || + !inode_owner_or_capable(idmap, inode) || + !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto journal_err_out; + } + + filemap_invalidate_lock(inode->i_mapping); + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto err_out; + + err = filemap_write_and_wait(inode_bl->i_mapping); + if (err) + goto err_out; + + /* Wait for all existing dio workers */ + inode_dio_wait(inode); + inode_dio_wait(inode_bl); + + truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages(&inode_bl->i_data, 0); + + handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); + if (IS_ERR(handle)) { + err = -EINVAL; + goto err_out; + } + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT, handle); + + /* Protect extent tree against block allocations via delalloc */ + ext4_double_down_write_data_sem(inode, inode_bl); + + if (is_bad_inode(inode_bl) || !S_ISREG(inode_bl->i_mode)) { + /* this inode has never been used as a BOOT_LOADER */ + set_nlink(inode_bl, 1); + i_uid_write(inode_bl, 0); + i_gid_write(inode_bl, 0); + inode_bl->i_flags = 0; + ei_bl->i_flags = 0; + inode_set_iversion(inode_bl, 1); + i_size_write(inode_bl, 0); + EXT4_I(inode_bl)->i_disksize = inode_bl->i_size; + inode_bl->i_mode = S_IFREG; + if (ext4_has_feature_extents(sb)) { + ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode_bl); + } else + memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); + } + + err = dquot_initialize(inode); + if (err) + goto err_out1; + + size = (qsize_t)(inode->i_blocks) * (1 << 9) + inode->i_bytes; + size_bl = (qsize_t)(inode_bl->i_blocks) * (1 << 9) + inode_bl->i_bytes; + diff = size - size_bl; + swap_inode_data(inode, inode_bl); + + inode_set_ctime_current(inode); + inode_set_ctime_current(inode_bl); + inode_inc_iversion(inode); + + inode->i_generation = get_random_u32(); + inode_bl->i_generation = get_random_u32(); + ext4_reset_inode_seed(inode); + ext4_reset_inode_seed(inode_bl); + + ext4_discard_preallocations(inode); + + err = ext4_mark_inode_dirty(handle, inode); + if (err < 0) { + /* No need to update quota information. */ + ext4_warning(inode->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode->i_ino, err); + /* Revert all changes: */ + swap_inode_data(inode, inode_bl); + ext4_mark_inode_dirty(handle, inode); + goto err_out1; + } + + blocks = inode_bl->i_blocks; + bytes = inode_bl->i_bytes; + inode_bl->i_blocks = inode->i_blocks; + inode_bl->i_bytes = inode->i_bytes; + err = ext4_mark_inode_dirty(handle, inode_bl); + if (err < 0) { + /* No need to update quota information. */ + ext4_warning(inode_bl->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode_bl->i_ino, err); + goto revert; + } + + /* Bootloader inode should not be counted into quota information. 
*/ + if (diff > 0) + dquot_free_space(inode, diff); + else + err = dquot_alloc_space(inode, -1 * diff); + + if (err < 0) { +revert: + /* Revert all changes: */ + inode_bl->i_blocks = blocks; + inode_bl->i_bytes = bytes; + swap_inode_data(inode, inode_bl); + ext4_mark_inode_dirty(handle, inode); + ext4_mark_inode_dirty(handle, inode_bl); + } + +err_out1: + ext4_journal_stop(handle); + ext4_double_up_write_data_sem(inode, inode_bl); + +err_out: + filemap_invalidate_unlock(inode->i_mapping); +journal_err_out: + unlock_two_nondirectories(inode, inode_bl); + iput(inode_bl); + return err; +} + +/* + * If immutable is set and we are not clearing it, we're not allowed to change + * anything else in the inode. Don't error out if we're only trying to set + * immutable on an immutable file. + */ +static int ext4_ioctl_check_immutable(struct inode *inode, __u32 new_projid, + unsigned int flags) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int oldflags = ei->i_flags; + + if (!(oldflags & EXT4_IMMUTABLE_FL) || !(flags & EXT4_IMMUTABLE_FL)) + return 0; + + if ((oldflags & ~EXT4_IMMUTABLE_FL) != (flags & ~EXT4_IMMUTABLE_FL)) + return -EPERM; + if (ext4_has_feature_project(inode->i_sb) && + __kprojid_val(ei->i_projid) != new_projid) + return -EPERM; + + return 0; +} + +static void ext4_dax_dontcache(struct inode *inode, unsigned int flags) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + if (S_ISDIR(inode->i_mode)) + return; + + if (test_opt2(inode->i_sb, DAX_NEVER) || + test_opt(inode->i_sb, DAX_ALWAYS)) + return; + + if ((ei->i_flags ^ flags) & EXT4_DAX_FL) + d_mark_dontcache(inode); +} + +static bool dax_compatible(struct inode *inode, unsigned int oldflags, + unsigned int flags) +{ + /* Allow the DAX flag to be changed on inline directories */ + if (S_ISDIR(inode->i_mode)) { + flags &= ~EXT4_INLINE_DATA_FL; + oldflags &= ~EXT4_INLINE_DATA_FL; + } + + if (flags & EXT4_DAX_FL) { + if ((oldflags & EXT4_DAX_MUT_EXCL) || + ext4_test_inode_state(inode, + EXT4_STATE_VERITY_IN_PROGRESS)) { + return false; + } + } + + if ((flags & EXT4_DAX_MUT_EXCL) && (oldflags & EXT4_DAX_FL)) + return false; + + return true; +} + +static int ext4_ioctl_setflags(struct inode *inode, + unsigned int flags) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + handle_t *handle = NULL; + int err = -EPERM, migrate = 0; + struct ext4_iloc iloc; + unsigned int oldflags, mask, i; + struct super_block *sb = inode->i_sb; + + /* Is it quota file? Do not allow user to mess with it */ + if (ext4_is_quota_file(inode)) + goto flags_out; + + oldflags = ei->i_flags; + /* + * The JOURNAL_DATA flag can only be changed by + * the relevant capability. + */ + if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + if (!capable(CAP_SYS_RESOURCE)) + goto flags_out; + } + + if (!dax_compatible(inode, oldflags, flags)) { + err = -EOPNOTSUPP; + goto flags_out; + } + + if ((flags ^ oldflags) & EXT4_EXTENTS_FL) + migrate = 1; + + if ((flags ^ oldflags) & EXT4_CASEFOLD_FL) { + if (!ext4_has_feature_casefold(sb)) { + err = -EOPNOTSUPP; + goto flags_out; + } + + if (!S_ISDIR(inode->i_mode)) { + err = -ENOTDIR; + goto flags_out; + } + + if (!ext4_empty_dir(inode)) { + err = -ENOTEMPTY; + goto flags_out; + } + } + + /* + * Wait for all pending directio and then flush all the dirty pages + * for this file. The flush marks all the pages readonly, so any + * subsequent attempt to write to the file (particularly mmap pages) + * will come through the filesystem and fail. 
+ */ + if (S_ISREG(inode->i_mode) && !IS_IMMUTABLE(inode) && + (flags & EXT4_IMMUTABLE_FL)) { + inode_dio_wait(inode); + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto flags_out; + } + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto flags_out; + } + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto flags_err; + + ext4_dax_dontcache(inode, flags); + + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { + if (!(mask & EXT4_FL_USER_MODIFIABLE)) + continue; + /* These flags get special treatment later */ + if (mask == EXT4_JOURNAL_DATA_FL || mask == EXT4_EXTENTS_FL) + continue; + if (mask & flags) + ext4_set_inode_flag(inode, i); + else + ext4_clear_inode_flag(inode, i); + } + + ext4_set_inode_flags(inode, false); + + inode_set_ctime_current(inode); + inode_inc_iversion(inode); + + err = ext4_mark_iloc_dirty(handle, inode, &iloc); +flags_err: + ext4_journal_stop(handle); + if (err) + goto flags_out; + + if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + /* + * Changes to the journaling mode can cause unsafe changes to + * S_DAX if the inode is DAX + */ + if (IS_DAX(inode)) { + err = -EBUSY; + goto flags_out; + } + + err = ext4_change_inode_journal_flag(inode, + flags & EXT4_JOURNAL_DATA_FL); + if (err) + goto flags_out; + } + if (migrate) { + if (flags & EXT4_EXTENTS_FL) + err = ext4_ext_migrate(inode); + else + err = ext4_ind_migrate(inode); + } + +flags_out: + return err; +} + +#ifdef CONFIG_QUOTA +static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) +{ + struct super_block *sb = inode->i_sb; + struct ext4_inode_info *ei = EXT4_I(inode); + int err, rc; + handle_t *handle; + kprojid_t kprojid; + struct ext4_iloc iloc; + struct ext4_inode *raw_inode; + struct dquot *transfer_to[MAXQUOTAS] = { }; + + if (!ext4_has_feature_project(sb)) { + if (projid != EXT4_DEF_PROJID) + return -EOPNOTSUPP; + else + return 0; + } + + if (EXT4_INODE_SIZE(sb) <= EXT4_GOOD_OLD_INODE_SIZE) + return -EOPNOTSUPP; + + kprojid = make_kprojid(&init_user_ns, (projid_t)projid); + + if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) + return 0; + + err = -EPERM; + /* Is it quota file? Do not allow user to mess with it */ + if (ext4_is_quota_file(inode)) + return err; + + err = dquot_initialize(inode); + if (err) + return err; + + err = ext4_get_inode_loc(inode, &iloc); + if (err) + return err; + + raw_inode = ext4_raw_inode(&iloc); + if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { + err = ext4_expand_extra_isize(inode, + EXT4_SB(sb)->s_want_extra_isize, + &iloc); + if (err) + return err; + } else { + brelse(iloc.bh); + } + + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, + EXT4_QUOTA_INIT_BLOCKS(sb) + + EXT4_QUOTA_DEL_BLOCKS(sb) + 3); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_stop; + + transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); + if (!IS_ERR(transfer_to[PRJQUOTA])) { + + /* __dquot_transfer() calls back ext4_get_inode_usage() which + * counts xattr inode references. 
+ */ + down_read(&EXT4_I(inode)->xattr_sem); + err = __dquot_transfer(inode, transfer_to); + up_read(&EXT4_I(inode)->xattr_sem); + dqput(transfer_to[PRJQUOTA]); + if (err) + goto out_dirty; + } + + EXT4_I(inode)->i_projid = kprojid; + inode_set_ctime_current(inode); + inode_inc_iversion(inode); +out_dirty: + rc = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; +out_stop: + ext4_journal_stop(handle); + return err; +} +#else +static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) +{ + if (projid != EXT4_DEF_PROJID) + return -EOPNOTSUPP; + return 0; +} +#endif + +int ext4_force_shutdown(struct super_block *sb, u32 flags) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int ret; + + if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH) + return -EINVAL; + + if (ext4_forced_shutdown(sb)) + return 0; + + ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags); + trace_ext4_shutdown(sb, flags); + + switch (flags) { + case EXT4_GOING_FLAGS_DEFAULT: + ret = bdev_freeze(sb->s_bdev); + if (ret) + return ret; + set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); + bdev_thaw(sb->s_bdev); + break; + case EXT4_GOING_FLAGS_LOGFLUSH: + set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); + if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) { + (void) ext4_force_commit(sb); + jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN); + } + break; + case EXT4_GOING_FLAGS_NOLOGFLUSH: + set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); + if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) + jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN); + break; + default: + return -EINVAL; + } + clear_opt(sb, DISCARD); + return 0; +} + +static int ext4_ioctl_shutdown(struct super_block *sb, unsigned long arg) +{ + u32 flags; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(flags, (__u32 __user *)arg)) + return -EFAULT; + + return ext4_force_shutdown(sb, flags); +} + +struct getfsmap_info { + struct super_block *gi_sb; + struct fsmap_head __user *gi_data; + unsigned int gi_idx; + __u32 gi_last_flags; +}; + +static int ext4_getfsmap_format(struct ext4_fsmap *xfm, void *priv) +{ + struct getfsmap_info *info = priv; + struct fsmap fm; + + trace_ext4_getfsmap_mapping(info->gi_sb, xfm); + + info->gi_last_flags = xfm->fmr_flags; + ext4_fsmap_from_internal(info->gi_sb, &fm, xfm); + if (copy_to_user(&info->gi_data->fmh_recs[info->gi_idx++], &fm, + sizeof(struct fsmap))) + return -EFAULT; + + return 0; +} + +static int ext4_ioc_getfsmap(struct super_block *sb, + struct fsmap_head __user *arg) +{ + struct getfsmap_info info = { NULL }; + struct ext4_fsmap_head xhead = {0}; + struct fsmap_head head; + bool aborted = false; + int error; + + if (copy_from_user(&head, arg, sizeof(struct fsmap_head))) + return -EFAULT; + if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) || + memchr_inv(head.fmh_keys[0].fmr_reserved, 0, + sizeof(head.fmh_keys[0].fmr_reserved)) || + memchr_inv(head.fmh_keys[1].fmr_reserved, 0, + sizeof(head.fmh_keys[1].fmr_reserved))) + return -EINVAL; + /* + * ext4 doesn't report file extents at all, so the only valid + * file offsets are the magic ones (all zeroes or all ones). 
+ */ + if (head.fmh_keys[0].fmr_offset || + (head.fmh_keys[1].fmr_offset != 0 && + head.fmh_keys[1].fmr_offset != -1ULL)) + return -EINVAL; + + xhead.fmh_iflags = head.fmh_iflags; + xhead.fmh_count = head.fmh_count; + ext4_fsmap_to_internal(sb, &xhead.fmh_keys[0], &head.fmh_keys[0]); + ext4_fsmap_to_internal(sb, &xhead.fmh_keys[1], &head.fmh_keys[1]); + + trace_ext4_getfsmap_low_key(sb, &xhead.fmh_keys[0]); + trace_ext4_getfsmap_high_key(sb, &xhead.fmh_keys[1]); + + info.gi_sb = sb; + info.gi_data = arg; + error = ext4_getfsmap(sb, &xhead, ext4_getfsmap_format, &info); + if (error == EXT4_QUERY_RANGE_ABORT) + aborted = true; + else if (error) + return error; + + /* If we didn't abort, set the "last" flag in the last fmx */ + if (!aborted && info.gi_idx) { + info.gi_last_flags |= FMR_OF_LAST; + if (copy_to_user(&info.gi_data->fmh_recs[info.gi_idx - 1].fmr_flags, + &info.gi_last_flags, + sizeof(info.gi_last_flags))) + return -EFAULT; + } + + /* copy back header */ + head.fmh_entries = xhead.fmh_entries; + head.fmh_oflags = xhead.fmh_oflags; + if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) + return -EFAULT; + + return 0; +} + +static long ext4_ioctl_group_add(struct file *file, + struct ext4_new_group_data *input) +{ + struct super_block *sb = file_inode(file)->i_sb; + int err, err2=0; + + err = ext4_resize_begin(sb); + if (err) + return err; + + if (ext4_has_feature_bigalloc(sb)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + err = -EOPNOTSUPP; + goto group_add_out; + } + + err = mnt_want_write_file(file); + if (err) + goto group_add_out; + + err = ext4_group_add(sb, input); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write_file(file); + if (!err && ext4_has_group_desc_csum(sb) && + test_opt(sb, INIT_INODE_TABLE)) + err = ext4_register_li_request(sb, input->group); +group_add_out: + err2 = ext4_resize_end(sb, false); + if (err == 0) + err = err2; + return err; +} + +int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct ext4_inode_info *ei = EXT4_I(inode); + u32 flags = ei->i_flags & EXT4_FL_USER_VISIBLE; + + if (S_ISREG(inode->i_mode)) + flags &= ~FS_PROJINHERIT_FL; + + fileattr_fill_flags(fa, flags); + if (ext4_has_feature_project(inode->i_sb)) + fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid); + + return 0; +} + +int ext4_fileattr_set(struct mnt_idmap *idmap, + struct dentry *dentry, struct file_kattr *fa) +{ + struct inode *inode = d_inode(dentry); + u32 flags = fa->flags; + int err = -EOPNOTSUPP; + + if (flags & ~EXT4_FL_USER_VISIBLE) + goto out; + + /* + * chattr(1) grabs flags via GETFLAGS, modifies the result and + * passes that to SETFLAGS. So we cannot easily make SETFLAGS + * more restrictive than just silently masking off visible but + * not settable flags as we always did. + */ + flags &= EXT4_FL_USER_MODIFIABLE; + if (ext4_mask_flags(inode->i_mode, flags) != flags) + goto out; + err = ext4_ioctl_check_immutable(inode, fa->fsx_projid, flags); + if (err) + goto out; + err = ext4_ioctl_setflags(inode, flags); + if (err) + goto out; + err = ext4_ioctl_setproject(inode, fa->fsx_projid); +out: + return err; +} + +/* So that the fiemap access checks can't overflow on 32 bit machines. 
*/ +#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) + +static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) +{ + struct fiemap fiemap; + struct fiemap __user *ufiemap = (struct fiemap __user *) arg; + struct fiemap_extent_info fieinfo = { 0, }; + struct inode *inode = file_inode(filp); + int error; + + if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) + return -EFAULT; + + if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) + return -EINVAL; + + fieinfo.fi_flags = fiemap.fm_flags; + fieinfo.fi_extents_max = fiemap.fm_extent_count; + fieinfo.fi_extents_start = ufiemap->fm_extents; + + error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, + fiemap.fm_length); + fiemap.fm_flags = fieinfo.fi_flags; + fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; + if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) + error = -EFAULT; + + return error; +} + +static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg) +{ + int err = 0; + __u32 flags = 0; + unsigned int flush_flags = 0; + struct super_block *sb = file_inode(filp)->i_sb; + + if (copy_from_user(&flags, (__u32 __user *)arg, + sizeof(__u32))) + return -EFAULT; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* check for invalid bits set */ + if ((flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID) || + ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && + (flags & JBD2_JOURNAL_FLUSH_ZEROOUT))) + return -EINVAL; + + if (!EXT4_SB(sb)->s_journal) + return -ENODEV; + + if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && + !bdev_max_discard_sectors(EXT4_SB(sb)->s_journal->j_dev)) + return -EOPNOTSUPP; + + if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN) + return 0; + + if (flags & EXT4_IOC_CHECKPOINT_FLAG_DISCARD) + flush_flags |= JBD2_JOURNAL_FLUSH_DISCARD; + + if (flags & EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT) { + flush_flags |= JBD2_JOURNAL_FLUSH_ZEROOUT; + pr_info_ratelimited("warning: checkpointing journal with EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT can be slow"); + } + + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, flush_flags); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + + return err; +} + +static int ext4_ioctl_setlabel(struct file *filp, const char __user *user_label) +{ + size_t len; + int ret = 0; + char new_label[EXT4_LABEL_MAX + 1]; + struct super_block *sb = file_inode(filp)->i_sb; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * Copy the maximum length allowed for ext4 label with one more to + * find the required terminating null byte in order to test the + * label length. The on disk label doesn't need to be null terminated. + */ + if (copy_from_user(new_label, user_label, EXT4_LABEL_MAX + 1)) + return -EFAULT; + + len = strnlen(new_label, EXT4_LABEL_MAX + 1); + if (len > EXT4_LABEL_MAX) + return -EINVAL; + + /* + * Clear the buffer after the new label + */ + memset(new_label + len, 0, EXT4_LABEL_MAX - len); + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = ext4_update_superblocks_fn(sb, ext4_sb_setlabel, new_label); + + mnt_drop_write_file(filp); + return ret; +} + +static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label) +{ + char label[EXT4_LABEL_MAX + 1]; + + /* + * EXT4_LABEL_MAX must always be smaller than FSLABEL_MAX because + * FSLABEL_MAX must include terminating null byte, while s_volume_name + * does not have to. 
+ */ + BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX); + + lock_buffer(sbi->s_sbh); + memtostr_pad(label, sbi->s_es->s_volume_name); + unlock_buffer(sbi->s_sbh); + + if (copy_to_user(user_label, label, sizeof(label))) + return -EFAULT; + return 0; +} + +static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi, + struct fsuuid __user *ufsuuid) +{ + struct fsuuid fsuuid; + __u8 uuid[UUID_SIZE]; + + if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid))) + return -EFAULT; + + if (fsuuid.fsu_len == 0) { + fsuuid.fsu_len = UUID_SIZE; + if (copy_to_user(&ufsuuid->fsu_len, &fsuuid.fsu_len, + sizeof(fsuuid.fsu_len))) + return -EFAULT; + return 0; + } + + if (fsuuid.fsu_len < UUID_SIZE || fsuuid.fsu_flags != 0) + return -EINVAL; + + lock_buffer(sbi->s_sbh); + memcpy(uuid, sbi->s_es->s_uuid, UUID_SIZE); + unlock_buffer(sbi->s_sbh); + + fsuuid.fsu_len = UUID_SIZE; + if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid)) || + copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE)) + return -EFAULT; + return 0; +} + +static int ext4_ioctl_setuuid(struct file *filp, + const struct fsuuid __user *ufsuuid) +{ + int ret = 0; + struct super_block *sb = file_inode(filp)->i_sb; + struct fsuuid fsuuid; + __u8 uuid[UUID_SIZE]; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * If any checksums (group descriptors or metadata) are being used + * then the checksum seed feature is required to change the UUID. + */ + if (((ext4_has_feature_gdt_csum(sb) || + ext4_has_feature_metadata_csum(sb)) + && !ext4_has_feature_csum_seed(sb)) + || ext4_has_feature_stable_inodes(sb)) + return -EOPNOTSUPP; + + if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid))) + return -EFAULT; + + if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0) + return -EINVAL; + + if (copy_from_user(uuid, &ufsuuid->fsu_uuid[0], UUID_SIZE)) + return -EFAULT; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = ext4_update_superblocks_fn(sb, ext4_sb_setuuid, &uuid); + mnt_drop_write_file(filp); + + return ret; +} + + +#define TUNE_OPS_SUPPORTED (EXT4_TUNE_FL_ERRORS_BEHAVIOR | \ + EXT4_TUNE_FL_MNT_COUNT | EXT4_TUNE_FL_MAX_MNT_COUNT | \ + EXT4_TUNE_FL_CHECKINTRVAL | EXT4_TUNE_FL_LAST_CHECK_TIME | \ + EXT4_TUNE_FL_RESERVED_BLOCKS | EXT4_TUNE_FL_RESERVED_UID | \ + EXT4_TUNE_FL_RESERVED_GID | EXT4_TUNE_FL_DEFAULT_MNT_OPTS | \ + EXT4_TUNE_FL_DEF_HASH_ALG | EXT4_TUNE_FL_RAID_STRIDE | \ + EXT4_TUNE_FL_RAID_STRIPE_WIDTH | EXT4_TUNE_FL_MOUNT_OPTS | \ + EXT4_TUNE_FL_FEATURES | EXT4_TUNE_FL_EDIT_FEATURES | \ + EXT4_TUNE_FL_FORCE_FSCK | EXT4_TUNE_FL_ENCODING | \ + EXT4_TUNE_FL_ENCODING_FLAGS) + +#define EXT4_TUNE_SET_COMPAT_SUPP \ + (EXT4_FEATURE_COMPAT_DIR_INDEX | \ + EXT4_FEATURE_COMPAT_STABLE_INODES) +#define EXT4_TUNE_SET_INCOMPAT_SUPP \ + (EXT4_FEATURE_INCOMPAT_EXTENTS | \ + EXT4_FEATURE_INCOMPAT_EA_INODE | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ + EXT4_FEATURE_INCOMPAT_LARGEDIR | \ + EXT4_FEATURE_INCOMPAT_CASEFOLD) +#define EXT4_TUNE_SET_RO_COMPAT_SUPP \ + (EXT4_FEATURE_RO_COMPAT_LARGE_FILE | \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_PROJECT | \ + EXT4_FEATURE_RO_COMPAT_VERITY) + +#define EXT4_TUNE_CLEAR_COMPAT_SUPP (0) +#define EXT4_TUNE_CLEAR_INCOMPAT_SUPP (0) +#define EXT4_TUNE_CLEAR_RO_COMPAT_SUPP (0) + +#define SB_ENC_SUPP_MASK (SB_ENC_STRICT_MODE_FL | \ + SB_ENC_NO_COMPAT_FALLBACK_FL) + +static int ext4_ioctl_get_tune_sb(struct ext4_sb_info *sbi, + struct ext4_tune_sb_params __user *params) +{ + struct ext4_tune_sb_params ret; + struct 
ext4_super_block *es = sbi->s_es; + + memset(&ret, 0, sizeof(ret)); + ret.set_flags = TUNE_OPS_SUPPORTED; + ret.errors_behavior = le16_to_cpu(es->s_errors); + ret.mnt_count = le16_to_cpu(es->s_mnt_count); + ret.max_mnt_count = le16_to_cpu(es->s_max_mnt_count); + ret.checkinterval = le32_to_cpu(es->s_checkinterval); + ret.last_check_time = le32_to_cpu(es->s_lastcheck); + ret.reserved_blocks = ext4_r_blocks_count(es); + ret.blocks_count = ext4_blocks_count(es); + ret.reserved_uid = ext4_get_resuid(es); + ret.reserved_gid = ext4_get_resgid(es); + ret.default_mnt_opts = le32_to_cpu(es->s_default_mount_opts); + ret.def_hash_alg = es->s_def_hash_version; + ret.raid_stride = le16_to_cpu(es->s_raid_stride); + ret.raid_stripe_width = le32_to_cpu(es->s_raid_stripe_width); + ret.encoding = le16_to_cpu(es->s_encoding); + ret.encoding_flags = le16_to_cpu(es->s_encoding_flags); + strscpy_pad(ret.mount_opts, es->s_mount_opts); + ret.feature_compat = le32_to_cpu(es->s_feature_compat); + ret.feature_incompat = le32_to_cpu(es->s_feature_incompat); + ret.feature_ro_compat = le32_to_cpu(es->s_feature_ro_compat); + ret.set_feature_compat_mask = EXT4_TUNE_SET_COMPAT_SUPP; + ret.set_feature_incompat_mask = EXT4_TUNE_SET_INCOMPAT_SUPP; + ret.set_feature_ro_compat_mask = EXT4_TUNE_SET_RO_COMPAT_SUPP; + ret.clear_feature_compat_mask = EXT4_TUNE_CLEAR_COMPAT_SUPP; + ret.clear_feature_incompat_mask = EXT4_TUNE_CLEAR_INCOMPAT_SUPP; + ret.clear_feature_ro_compat_mask = EXT4_TUNE_CLEAR_RO_COMPAT_SUPP; + if (copy_to_user(params, &ret, sizeof(ret))) + return -EFAULT; + return 0; +} + +static void ext4_sb_setparams(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) +{ + const struct ext4_tune_sb_params *params = arg; + + if (params->set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) + es->s_errors = cpu_to_le16(params->errors_behavior); + if (params->set_flags & EXT4_TUNE_FL_MNT_COUNT) + es->s_mnt_count = cpu_to_le16(params->mnt_count); + if (params->set_flags & EXT4_TUNE_FL_MAX_MNT_COUNT) + es->s_max_mnt_count = cpu_to_le16(params->max_mnt_count); + if (params->set_flags & EXT4_TUNE_FL_CHECKINTRVAL) + es->s_checkinterval = cpu_to_le32(params->checkinterval); + if (params->set_flags & EXT4_TUNE_FL_LAST_CHECK_TIME) + es->s_lastcheck = cpu_to_le32(params->last_check_time); + if (params->set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) { + ext4_fsblk_t blk = params->reserved_blocks; + + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); + } + if (params->set_flags & EXT4_TUNE_FL_RESERVED_UID) { + int uid = params->reserved_uid; + + es->s_def_resuid = cpu_to_le16(uid & 0xFFFF); + es->s_def_resuid_hi = cpu_to_le16(uid >> 16); + } + if (params->set_flags & EXT4_TUNE_FL_RESERVED_GID) { + int gid = params->reserved_gid; + + es->s_def_resgid = cpu_to_le16(gid & 0xFFFF); + es->s_def_resgid_hi = cpu_to_le16(gid >> 16); + } + if (params->set_flags & EXT4_TUNE_FL_DEFAULT_MNT_OPTS) + es->s_default_mount_opts = cpu_to_le32(params->default_mnt_opts); + if (params->set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + es->s_def_hash_version = params->def_hash_alg; + if (params->set_flags & EXT4_TUNE_FL_RAID_STRIDE) + es->s_raid_stride = cpu_to_le16(params->raid_stride); + if (params->set_flags & EXT4_TUNE_FL_RAID_STRIPE_WIDTH) + es->s_raid_stripe_width = + cpu_to_le32(params->raid_stripe_width); + if (params->set_flags & EXT4_TUNE_FL_ENCODING) + es->s_encoding = cpu_to_le16(params->encoding); + if (params->set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) + es->s_encoding_flags = 
cpu_to_le16(params->encoding_flags); + strscpy_pad(es->s_mount_opts, params->mount_opts); + if (params->set_flags & EXT4_TUNE_FL_EDIT_FEATURES) { + es->s_feature_compat |= + cpu_to_le32(params->set_feature_compat_mask); + es->s_feature_incompat |= + cpu_to_le32(params->set_feature_incompat_mask); + es->s_feature_ro_compat |= + cpu_to_le32(params->set_feature_ro_compat_mask); + es->s_feature_compat &= + ~cpu_to_le32(params->clear_feature_compat_mask); + es->s_feature_incompat &= + ~cpu_to_le32(params->clear_feature_incompat_mask); + es->s_feature_ro_compat &= + ~cpu_to_le32(params->clear_feature_ro_compat_mask); + if (params->set_feature_compat_mask & + EXT4_FEATURE_COMPAT_DIR_INDEX) + es->s_def_hash_version = sbi->s_def_hash_version; + if (params->set_feature_incompat_mask & + EXT4_FEATURE_INCOMPAT_CSUM_SEED) + es->s_checksum_seed = cpu_to_le32(sbi->s_csum_seed); + } + if (params->set_flags & EXT4_TUNE_FL_FORCE_FSCK) + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); +} + +static int ext4_ioctl_set_tune_sb(struct file *filp, + struct ext4_tune_sb_params __user *in) +{ + struct ext4_tune_sb_params params; + struct super_block *sb = file_inode(filp)->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int enabling_casefold = 0; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&params, in, sizeof(params))) + return -EFAULT; + + if ((params.set_flags & ~TUNE_OPS_SUPPORTED) != 0) + return -EOPNOTSUPP; + + if ((params.set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) && + (params.errors_behavior > EXT4_ERRORS_PANIC)) + return -EINVAL; + + if ((params.set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) && + (params.reserved_blocks > ext4_blocks_count(sbi->s_es) / 2)) + return -EINVAL; + if ((params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) && + ((params.def_hash_alg > DX_HASH_LAST) || + (params.def_hash_alg == DX_HASH_SIPHASH))) + return -EINVAL; + if ((params.set_flags & EXT4_TUNE_FL_FEATURES) && + (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES)) + return -EINVAL; + + if (params.set_flags & EXT4_TUNE_FL_FEATURES) { + params.set_feature_compat_mask = + params.feature_compat & + ~le32_to_cpu(es->s_feature_compat); + params.set_feature_incompat_mask = + params.feature_incompat & + ~le32_to_cpu(es->s_feature_incompat); + params.set_feature_ro_compat_mask = + params.feature_ro_compat & + ~le32_to_cpu(es->s_feature_ro_compat); + params.clear_feature_compat_mask = + ~params.feature_compat & + le32_to_cpu(es->s_feature_compat); + params.clear_feature_incompat_mask = + ~params.feature_incompat & + le32_to_cpu(es->s_feature_incompat); + params.clear_feature_ro_compat_mask = + ~params.feature_ro_compat & + le32_to_cpu(es->s_feature_ro_compat); + params.set_flags |= EXT4_TUNE_FL_EDIT_FEATURES; + } + if (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES) { + if ((params.set_feature_compat_mask & + ~EXT4_TUNE_SET_COMPAT_SUPP) || + (params.set_feature_incompat_mask & + ~EXT4_TUNE_SET_INCOMPAT_SUPP) || + (params.set_feature_ro_compat_mask & + ~EXT4_TUNE_SET_RO_COMPAT_SUPP) || + (params.clear_feature_compat_mask & + ~EXT4_TUNE_CLEAR_COMPAT_SUPP) || + (params.clear_feature_incompat_mask & + ~EXT4_TUNE_CLEAR_INCOMPAT_SUPP) || + (params.clear_feature_ro_compat_mask & + ~EXT4_TUNE_CLEAR_RO_COMPAT_SUPP)) + return -EOPNOTSUPP; + + /* + * Filter out the features that are already set from + * the set_mask. 
+ */ + params.set_feature_compat_mask &= + ~le32_to_cpu(es->s_feature_compat); + params.set_feature_incompat_mask &= + ~le32_to_cpu(es->s_feature_incompat); + params.set_feature_ro_compat_mask &= + ~le32_to_cpu(es->s_feature_ro_compat); + if ((params.set_feature_incompat_mask & + EXT4_FEATURE_INCOMPAT_CASEFOLD)) { + enabling_casefold = 1; + if (!(params.set_flags & EXT4_TUNE_FL_ENCODING)) { + params.encoding = EXT4_ENC_UTF8_12_1; + params.set_flags |= EXT4_TUNE_FL_ENCODING; + } + if (!(params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS)) { + params.encoding_flags = 0; + params.set_flags |= EXT4_TUNE_FL_ENCODING_FLAGS; + } + } + if ((params.set_feature_compat_mask & + EXT4_FEATURE_COMPAT_DIR_INDEX)) { + uuid_t uu; + + memcpy(&uu, sbi->s_hash_seed, UUID_SIZE); + if (uuid_is_null(&uu)) + generate_random_uuid((char *) + &sbi->s_hash_seed); + if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + sbi->s_def_hash_version = params.def_hash_alg; + else if (sbi->s_def_hash_version == 0) + sbi->s_def_hash_version = DX_HASH_HALF_MD4; + if (!(es->s_flags & + cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH)) && + !(es->s_flags & + cpu_to_le32(EXT2_FLAGS_SIGNED_HASH))) { +#ifdef __CHAR_UNSIGNED__ + sbi->s_hash_unsigned = 3; +#else + sbi->s_hash_unsigned = 0; +#endif + } + } + } + if (params.set_flags & EXT4_TUNE_FL_ENCODING) { + if (!enabling_casefold) + return -EINVAL; + if (params.encoding == 0) + params.encoding = EXT4_ENC_UTF8_12_1; + else if (params.encoding != EXT4_ENC_UTF8_12_1) + return -EINVAL; + } + if (params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) { + if (!enabling_casefold) + return -EINVAL; + if (params.encoding_flags & ~SB_ENC_SUPP_MASK) + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = ext4_update_superblocks_fn(sb, ext4_sb_setparams, &params); + mnt_drop_write_file(filp); + + if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + sbi->s_def_hash_version = params.def_hash_alg; + + return ret; +} + +static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; + struct mnt_idmap *idmap = file_mnt_idmap(filp); + + ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { + case FS_IOC_GETFSMAP: + return ext4_ioc_getfsmap(sb, (void __user *)arg); + case EXT4_IOC_GETVERSION: + case EXT4_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); + case EXT4_IOC_SETVERSION: + case EXT4_IOC_SETVERSION_OLD: { + handle_t *handle; + struct ext4_iloc iloc; + __u32 generation; + int err; + + if (!inode_owner_or_capable(idmap, inode)) + return -EPERM; + + if (ext4_has_feature_metadata_csum(inode->i_sb)) { + ext4_warning(sb, "Setting inode version is not " + "supported with metadata_csum enabled."); + return -ENOTTY; + } + + err = mnt_want_write_file(filp); + if (err) + return err; + if (get_user(generation, (int __user *) arg)) { + err = -EFAULT; + goto setversion_out; + } + + inode_lock(inode); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto unlock_out; + } + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err == 0) { + inode_set_ctime_current(inode); + inode_inc_iversion(inode); + inode->i_generation = generation; + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + } + ext4_journal_stop(handle); + +unlock_out: + inode_unlock(inode); +setversion_out: + mnt_drop_write_file(filp); + return err; + } + case EXT4_IOC_GROUP_EXTEND: { + ext4_fsblk_t n_blocks_count; + int err, 
err2=0; + + err = ext4_resize_begin(sb); + if (err) + return err; + + if (get_user(n_blocks_count, (__u32 __user *)arg)) { + err = -EFAULT; + goto group_extend_out; + } + + if (ext4_has_feature_bigalloc(sb)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + err = -EOPNOTSUPP; + goto group_extend_out; + } + + err = mnt_want_write_file(filp); + if (err) + goto group_extend_out; + + err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write_file(filp); +group_extend_out: + err2 = ext4_resize_end(sb, false); + if (err == 0) + err = err2; + return err; + } + + case EXT4_IOC_MOVE_EXT: { + struct move_extent me; + int err; + + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&me, + (struct move_extent __user *)arg, sizeof(me))) + return -EFAULT; + me.moved_len = 0; + + CLASS(fd, donor)(me.donor_fd); + if (fd_empty(donor)) + return -EBADF; + + if (!(fd_file(donor)->f_mode & FMODE_WRITE)) + return -EBADF; + + if (ext4_has_feature_bigalloc(sb)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported with bigalloc"); + return -EOPNOTSUPP; + } else if (IS_DAX(inode)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported with DAX"); + return -EOPNOTSUPP; + } + + err = mnt_want_write_file(filp); + if (err) + return err; + + err = ext4_move_extents(filp, fd_file(donor), me.orig_start, + me.donor_start, me.len, &me.moved_len); + mnt_drop_write_file(filp); + + if (copy_to_user((struct move_extent __user *)arg, + &me, sizeof(me))) + err = -EFAULT; + return err; + } + + case EXT4_IOC_GROUP_ADD: { + struct ext4_new_group_data input; + + if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, + sizeof(input))) + return -EFAULT; + + return ext4_ioctl_group_add(filp, &input); + } + + case EXT4_IOC_MIGRATE: + { + int err; + if (!inode_owner_or_capable(idmap, inode)) + return -EACCES; + + err = mnt_want_write_file(filp); + if (err) + return err; + /* + * inode_mutex prevent write and truncate on the file. + * Read still goes through. We take i_data_sem in + * ext4_ext_swap_inode_data before we switch the + * inode format to prevent read. 
+ */ + inode_lock((inode)); + err = ext4_ext_migrate(inode); + inode_unlock((inode)); + mnt_drop_write_file(filp); + return err; + } + + case EXT4_IOC_ALLOC_DA_BLKS: + { + int err; + if (!inode_owner_or_capable(idmap, inode)) + return -EACCES; + + err = mnt_want_write_file(filp); + if (err) + return err; + err = ext4_alloc_da_blocks(inode); + mnt_drop_write_file(filp); + return err; + } + + case EXT4_IOC_SWAP_BOOT: + { + int err; + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + err = mnt_want_write_file(filp); + if (err) + return err; + err = swap_inode_boot_loader(sb, idmap, inode); + mnt_drop_write_file(filp); + return err; + } + + case EXT4_IOC_RESIZE_FS: { + ext4_fsblk_t n_blocks_count; + int err = 0, err2 = 0; + ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; + + if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, + sizeof(__u64))) { + return -EFAULT; + } + + err = ext4_resize_begin(sb); + if (err) + return err; + + err = mnt_want_write_file(filp); + if (err) + goto resizefs_out; + + err = ext4_resize_fs(sb, n_blocks_count); + if (EXT4_SB(sb)->s_journal) { + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL); + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write_file(filp); + if (!err && (o_group < EXT4_SB(sb)->s_groups_count) && + ext4_has_group_desc_csum(sb) && + test_opt(sb, INIT_INODE_TABLE)) + err = ext4_register_li_request(sb, o_group); + +resizefs_out: + err2 = ext4_resize_end(sb, true); + if (err == 0) + err = err2; + return err; + } + + case FITRIM: + { + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!bdev_max_discard_sectors(sb->s_bdev)) + return -EOPNOTSUPP; + + /* + * We haven't replayed the journal, so we cannot use our + * block-bitmap-guided storage zapping commands. 
+ */ + if (test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) + return -EROFS; + + if (copy_from_user(&range, (struct fstrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + ret = ext4_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range __user *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } + case EXT4_IOC_PRECACHE_EXTENTS: + { + int ret; + + inode_lock_shared(inode); + ret = ext4_ext_precache(inode); + inode_unlock_shared(inode); + return ret; + } + case FS_IOC_SET_ENCRYPTION_POLICY: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); + + case FS_IOC_GET_ENCRYPTION_PWSALT: + return ext4_ioctl_get_encryption_pwsalt(filp, (void __user *)arg); + + case FS_IOC_GET_ENCRYPTION_POLICY: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); + + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_get_policy_ex(filp, (void __user *)arg); + + case FS_IOC_ADD_ENCRYPTION_KEY: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_add_key(filp, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_remove_key(filp, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_remove_key_all_users(filp, + (void __user *)arg); + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_get_key_status(filp, (void __user *)arg); + + case FS_IOC_GET_ENCRYPTION_NONCE: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_get_nonce(filp, (void __user *)arg); + + case EXT4_IOC_CLEAR_ES_CACHE: + { + if (!inode_owner_or_capable(idmap, inode)) + return -EACCES; + ext4_clear_inode_es(inode); + return 0; + } + + case EXT4_IOC_GETSTATE: + { + __u32 state = 0; + + if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED)) + state |= EXT4_STATE_FLAG_EXT_PRECACHED; + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) + state |= EXT4_STATE_FLAG_NEW; + if (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) + state |= EXT4_STATE_FLAG_NEWENTRY; + if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) + state |= EXT4_STATE_FLAG_DA_ALLOC_CLOSE; + + return put_user(state, (__u32 __user *) arg); + } + + case EXT4_IOC_GET_ES_CACHE: + return ext4_ioctl_get_es_cache(filp, arg); + + case EXT4_IOC_SHUTDOWN: + return ext4_ioctl_shutdown(sb, arg); + + case FS_IOC_ENABLE_VERITY: + if (!ext4_has_feature_verity(sb)) + return -EOPNOTSUPP; + return fsverity_ioctl_enable(filp, (const void __user *)arg); + + case FS_IOC_MEASURE_VERITY: + if (!ext4_has_feature_verity(sb)) + return -EOPNOTSUPP; + return fsverity_ioctl_measure(filp, (void __user *)arg); + + case FS_IOC_READ_VERITY_METADATA: + if (!ext4_has_feature_verity(sb)) + return -EOPNOTSUPP; + return fsverity_ioctl_read_metadata(filp, + (const void __user *)arg); + + case EXT4_IOC_CHECKPOINT: + return ext4_ioctl_checkpoint(filp, arg); + + case FS_IOC_GETFSLABEL: + return ext4_ioctl_getlabel(EXT4_SB(sb), (void __user *)arg); + + case FS_IOC_SETFSLABEL: + return ext4_ioctl_setlabel(filp, + (const void __user *)arg); + + case EXT4_IOC_GETFSUUID: + return ext4_ioctl_getuuid(EXT4_SB(sb), (void __user *)arg); + case 
EXT4_IOC_SETFSUUID: + return ext4_ioctl_setuuid(filp, (const void __user *)arg); + case EXT4_IOC_GET_TUNE_SB_PARAM: + return ext4_ioctl_get_tune_sb(EXT4_SB(sb), + (void __user *)arg); + case EXT4_IOC_SET_TUNE_SB_PARAM: + return ext4_ioctl_set_tune_sb(filp, (void __user *)arg); + default: + return -ENOTTY; + } +} + +long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + return __ext4_ioctl(filp, cmd, arg); +} + +#ifdef CONFIG_COMPAT +long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + /* These are just misnamed, they actually get/put from/to user an int */ + switch (cmd) { + case EXT4_IOC32_GETVERSION: + cmd = EXT4_IOC_GETVERSION; + break; + case EXT4_IOC32_SETVERSION: + cmd = EXT4_IOC_SETVERSION; + break; + case EXT4_IOC32_GROUP_EXTEND: + cmd = EXT4_IOC_GROUP_EXTEND; + break; + case EXT4_IOC32_GETVERSION_OLD: + cmd = EXT4_IOC_GETVERSION_OLD; + break; + case EXT4_IOC32_SETVERSION_OLD: + cmd = EXT4_IOC_SETVERSION_OLD; + break; + case EXT4_IOC32_GETRSVSZ: + cmd = EXT4_IOC_GETRSVSZ; + break; + case EXT4_IOC32_SETRSVSZ: + cmd = EXT4_IOC_SETRSVSZ; + break; + case EXT4_IOC32_GROUP_ADD: { + struct compat_ext4_new_group_input __user *uinput; + struct ext4_new_group_data input; + int err; + + uinput = compat_ptr(arg); + err = get_user(input.group, &uinput->group); + err |= get_user(input.block_bitmap, &uinput->block_bitmap); + err |= get_user(input.inode_bitmap, &uinput->inode_bitmap); + err |= get_user(input.inode_table, &uinput->inode_table); + err |= get_user(input.blocks_count, &uinput->blocks_count); + err |= get_user(input.reserved_blocks, + &uinput->reserved_blocks); + if (err) + return -EFAULT; + return ext4_ioctl_group_add(file, &input); + } + case EXT4_IOC_MOVE_EXT: + case EXT4_IOC_RESIZE_FS: + case FITRIM: + case EXT4_IOC_PRECACHE_EXTENTS: + case FS_IOC_SET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_PWSALT: + case FS_IOC_GET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + case FS_IOC_ADD_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + case FS_IOC_GET_ENCRYPTION_NONCE: + case EXT4_IOC_SHUTDOWN: + case FS_IOC_GETFSMAP: + case FS_IOC_ENABLE_VERITY: + case FS_IOC_MEASURE_VERITY: + case FS_IOC_READ_VERITY_METADATA: + case EXT4_IOC_CLEAR_ES_CACHE: + case EXT4_IOC_GETSTATE: + case EXT4_IOC_GET_ES_CACHE: + case EXT4_IOC_CHECKPOINT: + case FS_IOC_GETFSLABEL: + case FS_IOC_SETFSLABEL: + case EXT4_IOC_GETFSUUID: + case EXT4_IOC_SETFSUUID: + break; + default: + return -ENOIOCTLCMD; + } + return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif + +static void set_overhead(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) +{ + es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg)); +} + +int ext4_update_overhead(struct super_block *sb, bool force) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (ext4_emergency_state(sb) || sb_rdonly(sb)) + return 0; + if (!force && + (sbi->s_overhead == 0 || + sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters))) + return 0; + return ext4_update_superblocks_fn(sb, set_overhead, &sbi->s_overhead); +} diff --git a/fs/ext4l/mballoc-test.c b/fs/ext4l/mballoc-test.c new file mode 100644 index 00000000000..a9416b20ff6 --- /dev/null +++ b/fs/ext4l/mballoc-test.c @@ -0,0 +1,999 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test of ext4 multiblocks allocation. 
+ */ + +#include <kunit/test.h> +#include <kunit/static_stub.h> +#include <linux/random.h> + +#include "ext4.h" + +struct mbt_grp_ctx { + struct buffer_head bitmap_bh; + /* desc and gd_bh are just the place holders for now */ + struct ext4_group_desc desc; + struct buffer_head gd_bh; +}; + +struct mbt_ctx { + struct mbt_grp_ctx *grp_ctx; +}; + +struct mbt_ext4_super_block { + struct ext4_super_block es; + struct ext4_sb_info sbi; + struct mbt_ctx mbt_ctx; +}; + +#define MBT_SB(_sb) (container_of((_sb)->s_fs_info, struct mbt_ext4_super_block, sbi)) +#define MBT_CTX(_sb) (&MBT_SB(_sb)->mbt_ctx) +#define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group]) + +static struct inode *mbt_alloc_inode(struct super_block *sb) +{ + struct ext4_inode_info *ei; + + ei = kmalloc(sizeof(struct ext4_inode_info), GFP_KERNEL); + if (!ei) + return NULL; + + INIT_LIST_HEAD(&ei->i_orphan); + init_rwsem(&ei->xattr_sem); + init_rwsem(&ei->i_data_sem); + inode_init_once(&ei->vfs_inode); + ext4_fc_init_inode(&ei->vfs_inode); + + return &ei->vfs_inode; +} + +static void mbt_free_inode(struct inode *inode) +{ + kfree(EXT4_I(inode)); +} + +static const struct super_operations mbt_sops = { + .alloc_inode = mbt_alloc_inode, + .free_inode = mbt_free_inode, +}; + +static void mbt_kill_sb(struct super_block *sb) +{ + generic_shutdown_super(sb); +} + +static struct file_system_type mbt_fs_type = { + .name = "mballoc test", + .kill_sb = mbt_kill_sb, +}; + +static int mbt_mb_init(struct super_block *sb) +{ + ext4_fsblk_t block; + int ret; + + /* needed by ext4_mb_init->bdev_nonrot(sb->s_bdev) */ + sb->s_bdev = kzalloc(sizeof(*sb->s_bdev), GFP_KERNEL); + if (sb->s_bdev == NULL) + return -ENOMEM; + + sb->s_bdev->bd_queue = kzalloc(sizeof(struct request_queue), GFP_KERNEL); + if (sb->s_bdev->bd_queue == NULL) { + kfree(sb->s_bdev); + return -ENOMEM; + } + + /* + * needed by ext4_mb_init->ext4_mb_init_backend-> sbi->s_buddy_cache = + * new_inode(sb); + */ + INIT_LIST_HEAD(&sb->s_inodes); + sb->s_op = &mbt_sops; + + ret = ext4_mb_init(sb); + if (ret != 0) + goto err_out; + + block = ext4_count_free_clusters(sb); + ret = percpu_counter_init(&EXT4_SB(sb)->s_freeclusters_counter, block, + GFP_KERNEL); + if (ret != 0) + goto err_mb_release; + + ret = percpu_counter_init(&EXT4_SB(sb)->s_dirtyclusters_counter, 0, + GFP_KERNEL); + if (ret != 0) + goto err_freeclusters; + + return 0; + +err_freeclusters: + percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter); +err_mb_release: + ext4_mb_release(sb); +err_out: + kfree(sb->s_bdev->bd_queue); + kfree(sb->s_bdev); + return ret; +} + +static void mbt_mb_release(struct super_block *sb) +{ + percpu_counter_destroy(&EXT4_SB(sb)->s_dirtyclusters_counter); + percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter); + ext4_mb_release(sb); + kfree(sb->s_bdev->bd_queue); + kfree(sb->s_bdev); +} + +static int mbt_set(struct super_block *sb, void *data) +{ + return 0; +} + +static struct super_block *mbt_ext4_alloc_super_block(void) +{ + struct mbt_ext4_super_block *fsb; + struct super_block *sb; + struct ext4_sb_info *sbi; + + fsb = kzalloc(sizeof(*fsb), GFP_KERNEL); + if (fsb == NULL) + return NULL; + + sb = sget(&mbt_fs_type, NULL, mbt_set, 0, NULL); + if (IS_ERR(sb)) + goto out; + + sbi = &fsb->sbi; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) + goto out_deactivate; + + bgl_lock_init(sbi->s_blockgroup_lock); + + sbi->s_es = &fsb->es; + sbi->s_sb = sb; + sb->s_fs_info = sbi; + + up_write(&sb->s_umount); + return 
sb; + +out_deactivate: + deactivate_locked_super(sb); +out: + kfree(fsb); + return NULL; +} + +static void mbt_ext4_free_super_block(struct super_block *sb) +{ + struct mbt_ext4_super_block *fsb = MBT_SB(sb); + struct ext4_sb_info *sbi = EXT4_SB(sb); + + kfree(sbi->s_blockgroup_lock); + deactivate_super(sb); + kfree(fsb); +} + +struct mbt_ext4_block_layout { + unsigned char blocksize_bits; + unsigned int cluster_bits; + uint32_t blocks_per_group; + ext4_group_t group_count; + uint16_t desc_size; +}; + +static void mbt_init_sb_layout(struct super_block *sb, + struct mbt_ext4_block_layout *layout) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + + sb->s_blocksize = 1UL << layout->blocksize_bits; + sb->s_blocksize_bits = layout->blocksize_bits; + + sbi->s_groups_count = layout->group_count; + sbi->s_blocks_per_group = layout->blocks_per_group; + sbi->s_cluster_bits = layout->cluster_bits; + sbi->s_cluster_ratio = 1U << layout->cluster_bits; + sbi->s_clusters_per_group = layout->blocks_per_group >> + layout->cluster_bits; + sbi->s_desc_size = layout->desc_size; + sbi->s_desc_per_block_bits = + sb->s_blocksize_bits - (fls(layout->desc_size) - 1); + sbi->s_desc_per_block = 1 << sbi->s_desc_per_block_bits; + + es->s_first_data_block = cpu_to_le32(0); + es->s_blocks_count_lo = cpu_to_le32(layout->blocks_per_group * + layout->group_count); +} + +static int mbt_grp_ctx_init(struct super_block *sb, + struct mbt_grp_ctx *grp_ctx) +{ + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + + grp_ctx->bitmap_bh.b_data = kzalloc(EXT4_BLOCK_SIZE(sb), GFP_KERNEL); + if (grp_ctx->bitmap_bh.b_data == NULL) + return -ENOMEM; + mb_set_bits(grp_ctx->bitmap_bh.b_data, max, sb->s_blocksize * 8 - max); + ext4_free_group_clusters_set(sb, &grp_ctx->desc, max); + + return 0; +} + +static void mbt_grp_ctx_release(struct mbt_grp_ctx *grp_ctx) +{ + kfree(grp_ctx->bitmap_bh.b_data); + grp_ctx->bitmap_bh.b_data = NULL; +} + +static void mbt_ctx_mark_used(struct super_block *sb, ext4_group_t group, + unsigned int start, unsigned int len) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group); + + mb_set_bits(grp_ctx->bitmap_bh.b_data, start, len); +} + +static void *mbt_ctx_bitmap(struct super_block *sb, ext4_group_t group) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group); + + return grp_ctx->bitmap_bh.b_data; +} + +/* called after mbt_init_sb_layout */ +static int mbt_ctx_init(struct super_block *sb) +{ + struct mbt_ctx *ctx = MBT_CTX(sb); + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + + ctx->grp_ctx = kcalloc(ngroups, sizeof(struct mbt_grp_ctx), + GFP_KERNEL); + if (ctx->grp_ctx == NULL) + return -ENOMEM; + + for (i = 0; i < ngroups; i++) + if (mbt_grp_ctx_init(sb, &ctx->grp_ctx[i])) + goto out; + + /* + * first data block(first cluster in first group) is used by + * metadata, mark it used to avoid to alloc data block at first + * block which will fail ext4_sb_block_valid check. 
+ */ + mb_set_bits(ctx->grp_ctx[0].bitmap_bh.b_data, 0, 1); + ext4_free_group_clusters_set(sb, &ctx->grp_ctx[0].desc, + EXT4_CLUSTERS_PER_GROUP(sb) - 1); + + return 0; +out: + while (i-- > 0) + mbt_grp_ctx_release(&ctx->grp_ctx[i]); + kfree(ctx->grp_ctx); + return -ENOMEM; +} + +static void mbt_ctx_release(struct super_block *sb) +{ + struct mbt_ctx *ctx = MBT_CTX(sb); + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + + for (i = 0; i < ngroups; i++) + mbt_grp_ctx_release(&ctx->grp_ctx[i]); + kfree(ctx->grp_ctx); +} + +static struct buffer_head * +ext4_read_block_bitmap_nowait_stub(struct super_block *sb, ext4_group_t block_group, + bool ignore_locked) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group); + + /* paired with brelse from caller of ext4_read_block_bitmap_nowait */ + get_bh(&grp_ctx->bitmap_bh); + return &grp_ctx->bitmap_bh; +} + +static int ext4_wait_block_bitmap_stub(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh) +{ + /* + * real ext4_wait_block_bitmap will set these flags and + * functions like ext4_mb_init_cache will verify the flags. + */ + set_buffer_uptodate(bh); + set_bitmap_uptodate(bh); + set_buffer_verified(bh); + return 0; +} + +static struct ext4_group_desc * +ext4_get_group_desc_stub(struct super_block *sb, ext4_group_t block_group, + struct buffer_head **bh) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group); + + if (bh != NULL) + *bh = &grp_ctx->gd_bh; + + return &grp_ctx->desc; +} + +static int +ext4_mb_mark_context_stub(handle_t *handle, struct super_block *sb, bool state, + ext4_group_t group, ext4_grpblk_t blkoff, + ext4_grpblk_t len, int flags, + ext4_grpblk_t *ret_changed) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group); + struct buffer_head *bitmap_bh = &grp_ctx->bitmap_bh; + + if (state) + mb_set_bits(bitmap_bh->b_data, blkoff, len); + else + mb_clear_bits(bitmap_bh->b_data, blkoff, len); + + return 0; +} + +#define TEST_GOAL_GROUP 1 +static int mbt_kunit_init(struct kunit *test) +{ + struct mbt_ext4_block_layout *layout = + (struct mbt_ext4_block_layout *)(test->param_value); + struct super_block *sb; + int ret; + + sb = mbt_ext4_alloc_super_block(); + if (sb == NULL) + return -ENOMEM; + + mbt_init_sb_layout(sb, layout); + + ret = mbt_ctx_init(sb); + if (ret != 0) { + mbt_ext4_free_super_block(sb); + return ret; + } + + test->priv = sb; + kunit_activate_static_stub(test, + ext4_read_block_bitmap_nowait, + ext4_read_block_bitmap_nowait_stub); + kunit_activate_static_stub(test, + ext4_wait_block_bitmap, + ext4_wait_block_bitmap_stub); + kunit_activate_static_stub(test, + ext4_get_group_desc, + ext4_get_group_desc_stub); + kunit_activate_static_stub(test, + ext4_mb_mark_context, + ext4_mb_mark_context_stub); + + /* stub function will be called in mbt_mb_init->ext4_mb_init */ + if (mbt_mb_init(sb) != 0) { + mbt_ctx_release(sb); + mbt_ext4_free_super_block(sb); + return -ENOMEM; + } + + return 0; +} + +static void mbt_kunit_exit(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + + mbt_mb_release(sb); + mbt_ctx_release(sb); + mbt_ext4_free_super_block(sb); +} + +static void test_new_blocks_simple(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct inode *inode; + struct ext4_allocation_request ar; + ext4_group_t i, goal_group = TEST_GOAL_GROUP; + int err = 0; + ext4_fsblk_t found; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL); + if (!inode) + return; + + 
inode->i_sb = sb; + ar.inode = inode; + + /* get block at goal */ + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, ar.goal, found, + "failed to alloc block at goal, expected %llu found %llu", + ar.goal, found); + + /* get block after goal in goal group */ + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, ar.goal + EXT4_C2B(sbi, 1), found, + "failed to alloc block after goal in goal group, expected %llu found %llu", + ar.goal + 1, found); + + /* get block after goal group */ + mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb)); + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, + ext4_group_first_block_no(sb, goal_group + 1), found, + "failed to alloc block after goal group, expected %llu found %llu", + ext4_group_first_block_no(sb, goal_group + 1), found); + + /* get block before goal group */ + for (i = goal_group; i < ext4_get_groups_count(sb); i++) + mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb)); + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, + ext4_group_first_block_no(sb, 0) + EXT4_C2B(sbi, 1), found, + "failed to alloc block before goal group, expected %llu found %llu", + ext4_group_first_block_no(sb, 0 + EXT4_C2B(sbi, 1)), found); + + /* no block available, fail to allocate block */ + for (i = 0; i < ext4_get_groups_count(sb); i++) + mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb)); + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_NE_MSG(test, err, 0, + "unexpectedly get block when no block is available"); +} + +#define TEST_RANGE_COUNT 8 + +struct test_range { + ext4_grpblk_t start; + ext4_grpblk_t len; +}; + +static void +mbt_generate_test_ranges(struct super_block *sb, struct test_range *ranges, + int count) +{ + ext4_grpblk_t start, len, max; + int i; + + max = EXT4_CLUSTERS_PER_GROUP(sb) / count; + for (i = 0; i < count; i++) { + start = get_random_u32() % max; + len = get_random_u32() % max; + len = min(len, max - start); + + ranges[i].start = start + i * max; + ranges[i].len = len; + } +} + +static void +validate_free_blocks_simple(struct kunit *test, struct super_block *sb, + ext4_group_t goal_group, ext4_grpblk_t start, + ext4_grpblk_t len) +{ + void *bitmap; + ext4_grpblk_t bit, max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_group_t i; + + for (i = 0; i < ext4_get_groups_count(sb); i++) { + if (i == goal_group) + continue; + + bitmap = mbt_ctx_bitmap(sb, i); + bit = mb_find_next_zero_bit(bitmap, max, 0); + KUNIT_ASSERT_EQ_MSG(test, bit, max, + "free block on unexpected group %d", i); + } + + bitmap = mbt_ctx_bitmap(sb, goal_group); + bit = mb_find_next_zero_bit(bitmap, max, 0); + KUNIT_ASSERT_EQ(test, bit, start); + + bit = mb_find_next_bit(bitmap, max, bit + 1); + KUNIT_ASSERT_EQ(test, bit, start + len); +} + +static void +test_free_blocks_simple_range(struct kunit *test, ext4_group_t goal_group, + ext4_grpblk_t start, ext4_grpblk_t len) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode; + ext4_fsblk_t block; + + inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL); + if (!inode) + return; + inode->i_sb = sb; + + if (len == 0) + return; + + block = 
ext4_group_first_block_no(sb, goal_group) + + EXT4_C2B(sbi, start); + ext4_free_blocks_simple(inode, block, len); + validate_free_blocks_simple(test, sb, goal_group, start, len); + mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb)); +} + +static void test_free_blocks_simple(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_group_t i; + struct test_range ranges[TEST_RANGE_COUNT]; + + for (i = 0; i < ext4_get_groups_count(sb); i++) + mbt_ctx_mark_used(sb, i, 0, max); + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_free_blocks_simple_range(test, TEST_GOAL_GROUP, + ranges[i].start, ranges[i].len); +} + +static void +test_mark_diskspace_used_range(struct kunit *test, + struct ext4_allocation_context *ac, + ext4_grpblk_t start, + ext4_grpblk_t len) +{ + struct super_block *sb = (struct super_block *)test->priv; + int ret; + void *bitmap; + ext4_grpblk_t i, max; + + /* ext4_mb_mark_diskspace_used will BUG if len is 0 */ + if (len == 0) + return; + + ac->ac_b_ex.fe_group = TEST_GOAL_GROUP; + ac->ac_b_ex.fe_start = start; + ac->ac_b_ex.fe_len = len; + + bitmap = mbt_ctx_bitmap(sb, TEST_GOAL_GROUP); + memset(bitmap, 0, sb->s_blocksize); + ret = ext4_mb_mark_diskspace_used(ac, NULL, 0); + KUNIT_ASSERT_EQ(test, ret, 0); + + max = EXT4_CLUSTERS_PER_GROUP(sb); + i = mb_find_next_bit(bitmap, max, 0); + KUNIT_ASSERT_EQ(test, i, start); + i = mb_find_next_zero_bit(bitmap, max, i + 1); + KUNIT_ASSERT_EQ(test, i, start + len); + i = mb_find_next_bit(bitmap, max, i + 1); + KUNIT_ASSERT_EQ(test, max, i); +} + +static void test_mark_diskspace_used(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct inode *inode; + struct ext4_allocation_context ac; + struct test_range ranges[TEST_RANGE_COUNT]; + int i; + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + + inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL); + if (!inode) + return; + inode->i_sb = sb; + + ac.ac_status = AC_STATUS_FOUND; + ac.ac_sb = sb; + ac.ac_inode = inode; + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_mark_diskspace_used_range(test, &ac, ranges[i].start, + ranges[i].len); +} + +static void mbt_generate_buddy(struct super_block *sb, void *buddy, + void *bitmap, struct ext4_group_info *grp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + uint32_t order, off; + void *bb, *bb_h; + int max; + + memset(buddy, 0xff, sb->s_blocksize); + memset(grp, 0, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)])); + + bb = bitmap; + max = EXT4_CLUSTERS_PER_GROUP(sb); + bb_h = buddy + sbi->s_mb_offsets[1]; + + off = mb_find_next_zero_bit(bb, max, 0); + grp->bb_first_free = off; + while (off < max) { + grp->bb_counters[0]++; + grp->bb_free++; + + if (!(off & 1) && !mb_test_bit(off + 1, bb)) { + grp->bb_free++; + grp->bb_counters[0]--; + mb_clear_bit(off >> 1, bb_h); + grp->bb_counters[1]++; + grp->bb_largest_free_order = 1; + off++; + } + + off = mb_find_next_zero_bit(bb, max, off + 1); + } + + for (order = 1; order < MB_NUM_ORDERS(sb) - 1; order++) { + bb = buddy + sbi->s_mb_offsets[order]; + bb_h = buddy + sbi->s_mb_offsets[order + 1]; + max = max >> 1; + off = mb_find_next_zero_bit(bb, max, 0); + + while (off < max) { + if (!(off & 1) && !mb_test_bit(off + 1, bb)) { + mb_set_bits(bb, off, 2); + grp->bb_counters[order] -= 2; + mb_clear_bit(off >> 1, bb_h); + grp->bb_counters[order + 1]++; + grp->bb_largest_free_order = order + 
1; + off++; + } + + off = mb_find_next_zero_bit(bb, max, off + 1); + } + } + + max = EXT4_CLUSTERS_PER_GROUP(sb); + off = mb_find_next_zero_bit(bitmap, max, 0); + while (off < max) { + grp->bb_fragments++; + + off = mb_find_next_bit(bitmap, max, off + 1); + if (off + 1 >= max) + break; + + off = mb_find_next_zero_bit(bitmap, max, off + 1); + } +} + +static void +mbt_validate_group_info(struct kunit *test, struct ext4_group_info *grp1, + struct ext4_group_info *grp2) +{ + struct super_block *sb = (struct super_block *)test->priv; + int i; + + KUNIT_ASSERT_EQ(test, grp1->bb_first_free, + grp2->bb_first_free); + KUNIT_ASSERT_EQ(test, grp1->bb_fragments, + grp2->bb_fragments); + KUNIT_ASSERT_EQ(test, grp1->bb_free, grp2->bb_free); + KUNIT_ASSERT_EQ(test, grp1->bb_largest_free_order, + grp2->bb_largest_free_order); + + for (i = 1; i < MB_NUM_ORDERS(sb); i++) { + KUNIT_ASSERT_EQ_MSG(test, grp1->bb_counters[i], + grp2->bb_counters[i], + "bb_counters[%d] diffs, expected %d, generated %d", + i, grp1->bb_counters[i], + grp2->bb_counters[i]); + } +} + +static void +do_test_generate_buddy(struct kunit *test, struct super_block *sb, void *bitmap, + void *mbt_buddy, struct ext4_group_info *mbt_grp, + void *ext4_buddy, struct ext4_group_info *ext4_grp) +{ + int i; + + mbt_generate_buddy(sb, mbt_buddy, bitmap, mbt_grp); + + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + ext4_grp->bb_counters[i] = 0; + /* needed by validation in ext4_mb_generate_buddy */ + ext4_grp->bb_free = mbt_grp->bb_free; + memset(ext4_buddy, 0xff, sb->s_blocksize); + ext4_mb_generate_buddy(sb, ext4_buddy, bitmap, TEST_GOAL_GROUP, + ext4_grp); + + KUNIT_ASSERT_EQ(test, memcmp(mbt_buddy, ext4_buddy, sb->s_blocksize), + 0); + mbt_validate_group_info(test, mbt_grp, ext4_grp); +} + +static void test_mb_generate_buddy(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + void *bitmap, *expected_bb, *generate_bb; + struct ext4_group_info *expected_grp, *generate_grp; + struct test_range ranges[TEST_RANGE_COUNT]; + int i; + + bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap); + expected_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_bb); + generate_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, generate_bb); + expected_grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_grp); + generate_grp = ext4_get_group_info(sb, TEST_GOAL_GROUP); + KUNIT_ASSERT_NOT_NULL(test, generate_grp); + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) { + mb_set_bits(bitmap, ranges[i].start, ranges[i].len); + do_test_generate_buddy(test, sb, bitmap, expected_bb, + expected_grp, generate_bb, generate_grp); + } +} + +static void +test_mb_mark_used_range(struct kunit *test, struct ext4_buddy *e4b, + ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap, + void *buddy, struct ext4_group_info *grp) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct ext4_free_extent ex; + int i; + + /* mb_mark_used only accepts non-zero len */ + if (len == 0) + return; + + ex.fe_start = start; + ex.fe_len = len; + ex.fe_group = TEST_GOAL_GROUP; + + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_mark_used(e4b, &ex); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + + mb_set_bits(bitmap, start, len); + /* bypass bb_free validatoin in 
ext4_mb_generate_buddy */ + grp->bb_free -= len; + memset(buddy, 0xff, sb->s_blocksize); + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + grp->bb_counters[i] = 0; + ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp); + + KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize), + 0); + mbt_validate_group_info(test, grp, e4b->bd_info); +} + +static void test_mb_mark_used(struct kunit *test) +{ + struct ext4_buddy e4b; + struct super_block *sb = (struct super_block *)test->priv; + void *bitmap, *buddy; + struct ext4_group_info *grp; + int ret; + struct test_range ranges[TEST_RANGE_COUNT]; + int i; + + /* buddy cache assumes that each page contains at least one block */ + if (sb->s_blocksize > PAGE_SIZE) + kunit_skip(test, "blocksize exceeds pagesize"); + + bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap); + buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); + grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp); + + ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + KUNIT_ASSERT_EQ(test, ret, 0); + + grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb); + grp->bb_largest_free_order = -1; + grp->bb_avg_fragment_size_order = -1; + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_mb_mark_used_range(test, &e4b, ranges[i].start, + ranges[i].len, bitmap, buddy, grp); + + ext4_mb_unload_buddy(&e4b); +} + +static void +test_mb_free_blocks_range(struct kunit *test, struct ext4_buddy *e4b, + ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap, + void *buddy, struct ext4_group_info *grp) +{ + struct super_block *sb = (struct super_block *)test->priv; + int i; + + /* mb_free_blocks will WARN if len is 0 */ + if (len == 0) + return; + + ext4_lock_group(sb, e4b->bd_group); + mb_free_blocks(NULL, e4b, start, len); + ext4_unlock_group(sb, e4b->bd_group); + + mb_clear_bits(bitmap, start, len); + /* bypass bb_free validatoin in ext4_mb_generate_buddy */ + grp->bb_free += len; + memset(buddy, 0xff, sb->s_blocksize); + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + grp->bb_counters[i] = 0; + ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp); + + KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize), + 0); + mbt_validate_group_info(test, grp, e4b->bd_info); + +} + +static void test_mb_free_blocks(struct kunit *test) +{ + struct ext4_buddy e4b; + struct super_block *sb = (struct super_block *)test->priv; + void *bitmap, *buddy; + struct ext4_group_info *grp; + struct ext4_free_extent ex; + int ret; + int i; + struct test_range ranges[TEST_RANGE_COUNT]; + + /* buddy cache assumes that each page contains at least one block */ + if (sb->s_blocksize > PAGE_SIZE) + kunit_skip(test, "blocksize exceeds pagesize"); + + bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap); + buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); + grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp); + + ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + KUNIT_ASSERT_EQ(test, ret, 0); + + ex.fe_start = 0; + ex.fe_len = EXT4_CLUSTERS_PER_GROUP(sb); + ex.fe_group = TEST_GOAL_GROUP; + + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_mark_used(&e4b, &ex); + ext4_unlock_group(sb, 
TEST_GOAL_GROUP); + + grp->bb_free = 0; + grp->bb_largest_free_order = -1; + grp->bb_avg_fragment_size_order = -1; + memset(bitmap, 0xff, sb->s_blocksize); + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_mb_free_blocks_range(test, &e4b, ranges[i].start, + ranges[i].len, bitmap, buddy, grp); + + ext4_mb_unload_buddy(&e4b); +} + +#define COUNT_FOR_ESTIMATE 100000 +static void test_mb_mark_used_cost(struct kunit *test) +{ + struct ext4_buddy e4b; + struct super_block *sb = (struct super_block *)test->priv; + struct ext4_free_extent ex; + int ret; + struct test_range ranges[TEST_RANGE_COUNT]; + int i, j; + unsigned long start, end, all = 0; + + /* buddy cache assumes that each page contains at least one block */ + if (sb->s_blocksize > PAGE_SIZE) + kunit_skip(test, "blocksize exceeds pagesize"); + + ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + KUNIT_ASSERT_EQ(test, ret, 0); + + ex.fe_group = TEST_GOAL_GROUP; + for (j = 0; j < COUNT_FOR_ESTIMATE; j++) { + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + start = jiffies; + for (i = 0; i < TEST_RANGE_COUNT; i++) { + if (ranges[i].len == 0) + continue; + + ex.fe_start = ranges[i].start; + ex.fe_len = ranges[i].len; + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_mark_used(&e4b, &ex); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + } + end = jiffies; + all += (end - start); + + for (i = 0; i < TEST_RANGE_COUNT; i++) { + if (ranges[i].len == 0) + continue; + + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_free_blocks(NULL, &e4b, ranges[i].start, + ranges[i].len); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + } + } + + kunit_info(test, "costed jiffies %lu\n", all); + ext4_mb_unload_buddy(&e4b); +} + +static const struct mbt_ext4_block_layout mbt_test_layouts[] = { + { + .blocksize_bits = 10, + .cluster_bits = 3, + .blocks_per_group = 8192, + .group_count = 4, + .desc_size = 64, + }, + { + .blocksize_bits = 12, + .cluster_bits = 3, + .blocks_per_group = 8192, + .group_count = 4, + .desc_size = 64, + }, + { + .blocksize_bits = 16, + .cluster_bits = 3, + .blocks_per_group = 8192, + .group_count = 4, + .desc_size = 64, + }, +}; + +static void mbt_show_layout(const struct mbt_ext4_block_layout *layout, + char *desc) +{ + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "block_bits=%d cluster_bits=%d " + "blocks_per_group=%d group_count=%d desc_size=%d\n", + layout->blocksize_bits, layout->cluster_bits, + layout->blocks_per_group, layout->group_count, + layout->desc_size); +} +KUNIT_ARRAY_PARAM(mbt_layouts, mbt_test_layouts, mbt_show_layout); + +static struct kunit_case mbt_test_cases[] = { + KUNIT_CASE_PARAM(test_new_blocks_simple, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_free_blocks_simple, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mb_generate_buddy, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mb_mark_used, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mb_free_blocks, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mark_diskspace_used, mbt_layouts_gen_params), + KUNIT_CASE_PARAM_ATTR(test_mb_mark_used_cost, mbt_layouts_gen_params, + { .speed = KUNIT_SPEED_SLOW }), + {} +}; + +static struct kunit_suite mbt_test_suite = { + .name = "ext4_mballoc_test", + .init = mbt_kunit_init, + .exit = mbt_kunit_exit, + .test_cases = mbt_test_cases, +}; + +kunit_test_suites(&mbt_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/fs/ext4l/mballoc.h b/fs/ext4l/mballoc.h new file mode 100644 index 00000000000..15a049f05d0 --- /dev/null +++ b/fs/ext4l/mballoc.h @@ -0,0 +1,273 @@ +// 
SPDX-License-Identifier: GPL-2.0 +/* + * fs/ext4/mballoc.h + * + * Written by: Alex Tomas <alex@clusterfs.com> + * + */ +#ifndef _EXT4_MBALLOC_H +#define _EXT4_MBALLOC_H + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> +#include <linux/module.h> +#include <linux/swap.h> +#include <linux/proc_fs.h> +#include <linux/pagemap.h> +#include <linux/seq_file.h> +#include <linux/blkdev.h> +#include <linux/mutex.h> +#include "ext4_jbd2.h" +#include "ext4.h" + +/* + * mb_debug() dynamic printk msgs could be used to debug mballoc code. + */ +#ifdef CONFIG_EXT4_DEBUG +#define mb_debug(sb, fmt, ...) \ + pr_debug("[%s/%d] EXT4-fs (%s): (%s, %d): %s: " fmt, \ + current->comm, task_pid_nr(current), sb->s_id, \ + __FILE__, __LINE__, __func__, ##__VA_ARGS__) +#else +#define mb_debug(sb, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ +#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ + +/* + * How long mballoc can look for a best extent (in found extents) + */ +#define MB_DEFAULT_MAX_TO_SCAN 200 + +/* + * How long mballoc must look for a best extent + */ +#define MB_DEFAULT_MIN_TO_SCAN 10 + +/* + * with 's_mb_stats' allocator will collect stats that will be + * shown at umount. The collecting costs though! + */ +#define MB_DEFAULT_STATS 0 + +/* + * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served + * by the stream allocator, which purpose is to pack requests + * as close each to other as possible to produce smooth I/O traffic + * We use locality group prealloc space for stream request. + * We can tune the same via /proc/fs/ext4/<partition>/stream_req + */ +#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ + +/* + * for which requests use 2^N search using buddies + */ +#define MB_DEFAULT_ORDER2_REQS 2 + +/* + * default group prealloc size 512 blocks + */ +#define MB_DEFAULT_GROUP_PREALLOC 512 + +/* + * Number of groups to search linearly before performing group scanning + * optimization. + */ +#define MB_DEFAULT_LINEAR_LIMIT 4 + +/* + * Minimum number of groups that should be present in the file system to perform + * group scanning optimizations. + */ +#define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16 + +/* + * The maximum order upto which CR_BEST_AVAIL_LEN can trim a particular + * allocation request. Example, if we have an order 7 request and max trim order + * of 3, we can trim this request upto order 4. + */ +#define MB_DEFAULT_BEST_AVAIL_TRIM_ORDER 3 + +/* + * Number of valid buddy orders + */ +#define MB_NUM_ORDERS(sb) ((sb)->s_blocksize_bits + 2) + +struct ext4_free_data { + /* this links the free block information from sb_info */ + struct list_head efd_list; + + /* this links the free block information from group_info */ + struct rb_node efd_node; + + /* group which free block extent belongs */ + ext4_group_t efd_group; + + /* free block extent */ + ext4_grpblk_t efd_start_cluster; + ext4_grpblk_t efd_count; + + /* transaction which freed this extent */ + tid_t efd_tid; +}; + +struct ext4_prealloc_space { + union { + struct rb_node inode_node; /* for inode PA rbtree */ + struct list_head lg_list; /* for lg PAs */ + } pa_node; + struct list_head pa_group_list; + union { + struct list_head pa_tmp_list; + struct rcu_head pa_rcu; + } u; + spinlock_t pa_lock; + atomic_t pa_count; + unsigned pa_deleted; + ext4_fsblk_t pa_pstart; /* phys. block */ + ext4_lblk_t pa_lstart; /* log. 
block */ + ext4_grpblk_t pa_len; /* len of preallocated chunk */ + ext4_grpblk_t pa_free; /* how many blocks are free */ + unsigned short pa_type; /* pa type. inode or group */ + union { + rwlock_t *inode_lock; /* locks the rbtree holding this PA */ + spinlock_t *lg_lock; /* locks the lg list holding this PA */ + } pa_node_lock; + struct inode *pa_inode; /* used to get the inode during group discard */ +}; + +enum { + MB_INODE_PA = 0, + MB_GROUP_PA = 1 +}; + +struct ext4_free_extent { + ext4_lblk_t fe_logical; + ext4_grpblk_t fe_start; /* In cluster units */ + ext4_group_t fe_group; + ext4_grpblk_t fe_len; /* In cluster units */ +}; + +/* + * Locality group: + * we try to group all related changes together + * so that writeback can flush/allocate them together as well + * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC + * (512). We store prealloc space into the hash based on the pa_free blocks + * order value.ie, fls(pa_free)-1; + */ +#define PREALLOC_TB_SIZE 10 +struct ext4_locality_group { + /* for allocator */ + /* to serialize allocates */ + struct mutex lg_mutex; + /* list of preallocations */ + struct list_head lg_prealloc_list[PREALLOC_TB_SIZE]; + spinlock_t lg_prealloc_lock; +}; + +struct ext4_allocation_context { + struct inode *ac_inode; + struct super_block *ac_sb; + + /* original request */ + struct ext4_free_extent ac_o_ex; + + /* goal request (normalized ac_o_ex) */ + struct ext4_free_extent ac_g_ex; + + /* the best found extent */ + struct ext4_free_extent ac_b_ex; + + /* copy of the best found extent taken before preallocation efforts */ + struct ext4_free_extent ac_f_ex; + + /* + * goal len can change in CR_BEST_AVAIL_LEN, so save the original len. + * This is used while adjusting the PA window and for accounting. + */ + ext4_grpblk_t ac_orig_goal_len; + + ext4_group_t ac_prefetch_grp; + unsigned int ac_prefetch_ios; + unsigned int ac_prefetch_nr; + + int ac_first_err; + + __u32 ac_flags; /* allocation hints */ + __u16 ac_groups_scanned; + __u16 ac_found; + __u16 ac_cX_found[EXT4_MB_NUM_CRS]; + __u16 ac_tail; + __u16 ac_buddy; + __u8 ac_status; + __u8 ac_criteria; + __u8 ac_2order; /* if request is to allocate 2^N blocks and + * N > 0, the field stores N, otherwise 0 */ + __u8 ac_op; /* operation, for history only */ + + struct ext4_buddy *ac_e4b; + struct folio *ac_bitmap_folio; + struct folio *ac_buddy_folio; + struct ext4_prealloc_space *ac_pa; + struct ext4_locality_group *ac_lg; +}; + +#define AC_STATUS_CONTINUE 1 +#define AC_STATUS_FOUND 2 +#define AC_STATUS_BREAK 3 + +struct ext4_buddy { + struct folio *bd_buddy_folio; + void *bd_buddy; + struct folio *bd_bitmap_folio; + void *bd_bitmap; + struct ext4_group_info *bd_info; + struct super_block *bd_sb; + __u16 bd_blkbits; + ext4_group_t bd_group; +}; + +static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, + struct ext4_free_extent *fex) +{ + return ext4_group_first_block_no(sb, fex->fe_group) + + (fex->fe_start << EXT4_SB(sb)->s_cluster_bits); +} + +static inline loff_t extent_logical_end(struct ext4_sb_info *sbi, + struct ext4_free_extent *fex) +{ + /* Use loff_t to avoid end exceeding ext4_lblk_t max. */ + return (loff_t)fex->fe_logical + EXT4_C2B(sbi, fex->fe_len); +} + +static inline loff_t pa_logical_end(struct ext4_sb_info *sbi, + struct ext4_prealloc_space *pa) +{ + /* Use loff_t to avoid end exceeding ext4_lblk_t max. 
*/ + return (loff_t)pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len); +} + +typedef int (*ext4_mballoc_query_range_fn)( + struct super_block *sb, + ext4_group_t agno, + ext4_grpblk_t start, + ext4_grpblk_t len, + void *priv); + +int +ext4_mballoc_query_range( + struct super_block *sb, + ext4_group_t agno, + ext4_grpblk_t start, + ext4_grpblk_t end, + ext4_mballoc_query_range_fn meta_formatter, + ext4_mballoc_query_range_fn formatter, + void *priv); + +#endif -- 2.43.0