[PATCH 00/15] ext4l: Add jbd2 and remaining ext4l files to build (part F)
From: Simon Glass <simon.glass@canonical.com>

This series continues the ext4l port by adding the jbd2 journaling
layer and remaining ext4l files to the build.

Changes:
- Fix pr_emerg() macro to use log_emer()
- Import jbd2 source files from Linux 6.18
- Add jbd2 files to build: checkpoint, commit, journal, recovery,
  revoke, transaction
- Add ext4l files to build: migrate, mmp, move_extent, resize, fsmap

Each file addition includes the necessary stubs in ext4_uboot.h and
removes redundant stubs from stub.c as real implementations become
available.

Simon Glass (15):
  printk: Fix pr_emerg to use log_emer
  jbd2: Add checkpoint.c and revoke.c from Linux
  jbd2: Add recovery.c and commit.c from Linux
  jbd2: Add transaction.c from Linux
  jbd2: Add journal.c from Linux
  jbd2: Add checkpoint.c to the build
  jbd2: Add journal.c to the build
  jbd2: Add recovery.c to the build
  jbd2: Add revoke.c to the build
  jbd2: Add transaction.c to the build
  ext4l: Add migrate.c to the build
  ext4l: Add mmp.c to the build
  ext4l: Add move_extent.c to the build
  ext4l: Add resize.c to the build
  ext4l: Add fsmap.c to the build

 fs/Makefile            |    1 +
 fs/ext4l/Makefile      |    6 +-
 fs/ext4l/ext4_uboot.h  |  258 +++-
 fs/ext4l/fsmap.c       |    5 +-
 fs/ext4l/migrate.c     |    2 +-
 fs/ext4l/mmp.c         |    6 +-
 fs/ext4l/move_extent.c |    5 +-
 fs/ext4l/resize.c      |    6 +-
 fs/ext4l/stub.c        |  314 ++--
 fs/jbd2/Makefile       |    6 +
 fs/jbd2/checkpoint.c   |  713 +++++++++
 fs/jbd2/commit.c       | 1147 +++++++++++++++
 fs/jbd2/journal.c      | 3136 ++++++++++++++++++++++++++++++++++++++++
 fs/jbd2/recovery.c     |  987 +++++++++++++
 fs/jbd2/revoke.c       |  731 ++++++++++
 fs/jbd2/transaction.c  | 2738 +++++++++++++++++++++++++++++++++++
 include/linux/fs.h     |    1 +
 include/linux/printk.h |    2 +-
 18 files changed, 9813 insertions(+), 251 deletions(-)
 create mode 100644 fs/jbd2/Makefile
 create mode 100644 fs/jbd2/checkpoint.c
 create mode 100644 fs/jbd2/commit.c
 create mode 100644 fs/jbd2/journal.c
 create mode 100644 fs/jbd2/recovery.c
 create mode 100644 fs/jbd2/revoke.c
 create mode 100644 fs/jbd2/transaction.c

-- 
2.43.0

base-commit: 2e36fa45effa46ee965af41e99bf50ecd4636976
branch: extf
From: Simon Glass <simon.glass@canonical.com>

The pr_emerg() macro called log_emerg(), which does not exist. The
correct function name is log_emer(), matching the LOGL_EMERG log
level.

Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com>
Signed-off-by: Simon Glass <simon.glass@canonical.com>
---
 include/linux/printk.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/printk.h b/include/linux/printk.h
index e28cef0ac31..edf149f52c7 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -35,7 +35,7 @@
 #define pr_emerg(fmt, ...) \
 	({ \
-		CONFIG_LOGLEVEL > 0 ? log_emerg(fmt, ##__VA_ARGS__) : 0; \
+		CONFIG_LOGLEVEL > 0 ? log_emer(fmt, ##__VA_ARGS__) : 0; \
 	})
 
 #define pr_alert(fmt, ...) \
 	({ \
-- 
2.43.0
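Note: the jbd2 sources imported by the following patches are the
first new callers of this macro. A minimal sketch of one such call
site (taken from the revoke.c import in patch 2), showing what the
corrected macro now expands to:

	/* From jbd2_journal_init_revoke_record_cache() in
	 * fs/jbd2/revoke.c: with this fix, pr_emerg() expands to
	 * log_emer(), logging at LOGL_EMERG, instead of referencing
	 * the non-existent log_emerg()
	 */
	if (!jbd2_revoke_record_cache) {
		pr_emerg("JBD2: failed to create revoke_record cache\n");
		return -ENOMEM;
	}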
From: Simon Glass <simon.glass@canonical.com> Add the JBD2 journaling layer checkpoint and revoke functionality from the Linux 6.18 kernel ext4 filesystem driver. checkpoint.c handles: - Log space management and checkpointing - Buffer flushing and cleanup - Transaction checkpoint processing revoke.c handles: - Block revocation for journal recovery - Revoke record management - Hash table for revoked blocks These files are part of the JBD2 (Journaling Block Device 2) layer that provides transaction support for ext4. Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: Simon Glass <simon.glass@canonical.com> --- fs/jbd2/checkpoint.c | 718 +++++++++++++++++++++++++++++++++++++++++ fs/jbd2/revoke.c | 743 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1461 insertions(+) create mode 100644 fs/jbd2/checkpoint.c create mode 100644 fs/jbd2/revoke.c diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c new file mode 100644 index 00000000000..2d0719bf6d8 --- /dev/null +++ b/fs/jbd2/checkpoint.c @@ -0,0 +1,718 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * linux/fs/jbd2/checkpoint.c + * + * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 + * + * Copyright 1999 Red Hat Software --- All Rights Reserved + * + * Checkpoint routines for the generic filesystem journaling code. + * Part of the ext2fs journaling system. + * + * Checkpointing is the process of ensuring that a section of the log is + * committed fully to disk, so that that portion of the log can be + * reused. + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/jbd2.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <trace/events/jbd2.h> + +/* + * Unlink a buffer from a transaction checkpoint list. + * + * Called with j_list_lock held. + */ +static inline void __buffer_unlink(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + jh->b_cpnext->b_cpprev = jh->b_cpprev; + jh->b_cpprev->b_cpnext = jh->b_cpnext; + if (transaction->t_checkpoint_list == jh) { + transaction->t_checkpoint_list = jh->b_cpnext; + if (transaction->t_checkpoint_list == jh) + transaction->t_checkpoint_list = NULL; + } +} + +/* + * __jbd2_log_wait_for_space: wait until there is space in the journal. + * + * Called under j-state_lock *only*. It will be unlocked if we have to wait + * for a checkpoint to free up some space in the log. + */ +void __jbd2_log_wait_for_space(journal_t *journal) +__acquires(&journal->j_state_lock) +__releases(&journal->j_state_lock) +{ + int nblocks, space_left; + /* assert_spin_locked(&journal->j_state_lock); */ + + nblocks = journal->j_max_transaction_buffers; + while (jbd2_log_space_left(journal) < nblocks) { + write_unlock(&journal->j_state_lock); + mutex_lock_io(&journal->j_checkpoint_mutex); + + /* + * Test again, another process may have checkpointed while we + * were waiting for the checkpoint lock. If there are no + * transactions ready to be checkpointed, try to recover + * journal space by calling cleanup_journal_tail(), and if + * that doesn't work, by waiting for the currently committing + * transaction to complete. If there is absolutely no way + * to make progress, this is either a BUG or corrupted + * filesystem, so abort the journal and leave a stack + * trace for forensic evidence. 
+ */ + write_lock(&journal->j_state_lock); + if (journal->j_flags & JBD2_ABORT) { + mutex_unlock(&journal->j_checkpoint_mutex); + return; + } + spin_lock(&journal->j_list_lock); + space_left = jbd2_log_space_left(journal); + if (space_left < nblocks) { + int chkpt = journal->j_checkpoint_transactions != NULL; + tid_t tid = 0; + bool has_transaction = false; + + if (journal->j_committing_transaction) { + tid = journal->j_committing_transaction->t_tid; + has_transaction = true; + } + spin_unlock(&journal->j_list_lock); + write_unlock(&journal->j_state_lock); + if (chkpt) { + jbd2_log_do_checkpoint(journal); + } else if (jbd2_cleanup_journal_tail(journal) <= 0) { + /* + * We were able to recover space or the + * journal was aborted due to an error. + */ + ; + } else if (has_transaction) { + /* + * jbd2_journal_commit_transaction() may want + * to take the checkpoint_mutex if JBD2_FLUSHED + * is set. So we need to temporarily drop it. + */ + mutex_unlock(&journal->j_checkpoint_mutex); + jbd2_log_wait_commit(journal, tid); + write_lock(&journal->j_state_lock); + continue; + } else { + printk(KERN_ERR "%s: needed %d blocks and " + "only had %d space available\n", + __func__, nblocks, space_left); + printk(KERN_ERR "%s: no way to get more " + "journal space in %s\n", __func__, + journal->j_devname); + WARN_ON(1); + jbd2_journal_abort(journal, -EIO); + } + write_lock(&journal->j_state_lock); + } else { + spin_unlock(&journal->j_list_lock); + } + mutex_unlock(&journal->j_checkpoint_mutex); + } +} + +static void +__flush_batch(journal_t *journal, int *batch_count) +{ + int i; + struct blk_plug plug; + + blk_start_plug(&plug); + for (i = 0; i < *batch_count; i++) + write_dirty_buffer(journal->j_chkpt_bhs[i], JBD2_JOURNAL_REQ_FLAGS); + blk_finish_plug(&plug); + + for (i = 0; i < *batch_count; i++) { + struct buffer_head *bh = journal->j_chkpt_bhs[i]; + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + journal->j_chkpt_bhs[i] = NULL; + } + *batch_count = 0; +} + +/* + * Perform an actual checkpoint. We take the first transaction on the + * list of transactions to be checkpointed and send all its buffers + * to disk. We submit larger chunks of data at once. + * + * The journal should be locked before calling this function. + * Called with j_checkpoint_mutex held. + */ +int jbd2_log_do_checkpoint(journal_t *journal) +{ + struct journal_head *jh; + struct buffer_head *bh; + transaction_t *transaction; + tid_t this_tid; + int result, batch_count = 0; + + jbd2_debug(1, "Start checkpoint\n"); + + /* + * First thing: if there are any transactions in the log which + * don't need checkpointing, just eliminate them from the + * journal straight away. + */ + result = jbd2_cleanup_journal_tail(journal); + trace_jbd2_checkpoint(journal, result); + jbd2_debug(1, "cleanup_journal_tail returned %d\n", result); + if (result <= 0) + return result; + + /* + * OK, we need to start writing disk blocks. Take one transaction + * and write it. + */ + spin_lock(&journal->j_list_lock); + if (!journal->j_checkpoint_transactions) + goto out; + transaction = journal->j_checkpoint_transactions; + if (transaction->t_chp_stats.cs_chp_time == 0) + transaction->t_chp_stats.cs_chp_time = jiffies; + this_tid = transaction->t_tid; +restart: + /* + * If someone cleaned up this transaction while we slept, we're + * done (maybe it's a new transaction, but it fell at the same + * address). 
+ */ + if (journal->j_checkpoint_transactions != transaction || + transaction->t_tid != this_tid) + goto out; + + /* checkpoint all of the transaction's buffers */ + while (transaction->t_checkpoint_list) { + jh = transaction->t_checkpoint_list; + bh = jh2bh(jh); + + if (jh->b_transaction != NULL) { + transaction_t *t = jh->b_transaction; + tid_t tid = t->t_tid; + + transaction->t_chp_stats.cs_forced_to_close++; + spin_unlock(&journal->j_list_lock); + if (unlikely(journal->j_flags & JBD2_UNMOUNT)) + /* + * The journal thread is dead; so + * starting and waiting for a commit + * to finish will cause us to wait for + * a _very_ long time. + */ + printk(KERN_ERR + "JBD2: %s: Waiting for Godot: block %llu\n", + journal->j_devname, (unsigned long long) bh->b_blocknr); + + if (batch_count) + __flush_batch(journal, &batch_count); + jbd2_log_start_commit(journal, tid); + /* + * jbd2_journal_commit_transaction() may want + * to take the checkpoint_mutex if JBD2_FLUSHED + * is set, jbd2_update_log_tail() called by + * jbd2_journal_commit_transaction() may also take + * checkpoint_mutex. So we need to temporarily + * drop it. + */ + mutex_unlock(&journal->j_checkpoint_mutex); + jbd2_log_wait_commit(journal, tid); + mutex_lock_io(&journal->j_checkpoint_mutex); + spin_lock(&journal->j_list_lock); + goto restart; + } + if (!trylock_buffer(bh)) { + /* + * The buffer is locked, it may be writing back, or + * flushing out in the last couple of cycles, or + * re-adding into a new transaction, need to check + * it again until it's unlocked. + */ + get_bh(bh); + spin_unlock(&journal->j_list_lock); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + goto retry; + } else if (!buffer_dirty(bh)) { + unlock_buffer(bh); + BUFFER_TRACE(bh, "remove from checkpoint"); + /* + * If the transaction was released or the checkpoint + * list was empty, we're done. + */ + if (__jbd2_journal_remove_checkpoint(jh) || + !transaction->t_checkpoint_list) + goto out; + } else { + unlock_buffer(bh); + /* + * We are about to write the buffer, it could be + * raced by some other transaction shrink or buffer + * re-log logic once we release the j_list_lock, + * leave it on the checkpoint list and check status + * again to make sure it's clean. + */ + BUFFER_TRACE(bh, "queue"); + get_bh(bh); + J_ASSERT_BH(bh, !buffer_jwrite(bh)); + journal->j_chkpt_bhs[batch_count++] = bh; + transaction->t_chp_stats.cs_written++; + transaction->t_checkpoint_list = jh->b_cpnext; + } + + if ((batch_count == JBD2_NR_BATCH) || + need_resched() || spin_needbreak(&journal->j_list_lock) || + jh2bh(transaction->t_checkpoint_list) == journal->j_chkpt_bhs[0]) + goto unlock_and_flush; + } + + if (batch_count) { + unlock_and_flush: + spin_unlock(&journal->j_list_lock); + retry: + if (batch_count) + __flush_batch(journal, &batch_count); + cond_resched(); + spin_lock(&journal->j_list_lock); + goto restart; + } + +out: + spin_unlock(&journal->j_list_lock); + result = jbd2_cleanup_journal_tail(journal); + + return (result < 0) ? result : 0; +} + +/* + * Check the list of checkpoint transactions for the journal to see if + * we have already got rid of any since the last update of the log tail + * in the journal superblock. If so, we can instantly roll the + * superblock forward to remove those transactions from the log. + * + * Return <0 on error, 0 on success, 1 if there was nothing to clean up. + * + * Called with the journal lock held. 
+ * + * This is the only part of the journaling code which really needs to be + * aware of transaction aborts. Checkpointing involves writing to the + * main filesystem area rather than to the journal, so it can proceed + * even in abort state, but we must not update the super block if + * checkpointing may have failed. Otherwise, we would lose some metadata + * buffers which should be written-back to the filesystem. + */ + +int jbd2_cleanup_journal_tail(journal_t *journal) +{ + tid_t first_tid; + unsigned long blocknr; + + if (is_journal_aborted(journal)) + return -EIO; + + if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr)) + return 1; + J_ASSERT(blocknr != 0); + + /* + * We need to make sure that any blocks that were recently written out + * --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before + * we drop the transactions from the journal. It's unlikely this will + * be necessary, especially with an appropriately sized journal, but we + * need this to guarantee correctness. Fortunately + * jbd2_cleanup_journal_tail() doesn't get called all that often. + */ + if (journal->j_flags & JBD2_BARRIER) + blkdev_issue_flush(journal->j_fs_dev); + + return __jbd2_update_log_tail(journal, first_tid, blocknr); +} + + +/* Checkpoint list management */ + +/* + * journal_shrink_one_cp_list + * + * Find all the written-back checkpoint buffers in the given list + * and try to release them. If the whole transaction is released, set + * the 'released' parameter. Return the number of released checkpointed + * buffers. + * + * Called with j_list_lock held. + */ +static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, + enum jbd2_shrink_type type, + bool *released) +{ + struct journal_head *last_jh; + struct journal_head *next_jh = jh; + unsigned long nr_freed = 0; + int ret; + + *released = false; + if (!jh) + return 0; + + last_jh = jh->b_cpprev; + do { + jh = next_jh; + next_jh = jh->b_cpnext; + + if (type == JBD2_SHRINK_DESTROY) { + ret = __jbd2_journal_remove_checkpoint(jh); + } else { + ret = jbd2_journal_try_remove_checkpoint(jh); + if (ret < 0) { + if (type == JBD2_SHRINK_BUSY_SKIP) + continue; + break; + } + } + + nr_freed++; + if (ret) { + *released = true; + break; + } + + if (need_resched()) + break; + } while (jh != last_jh); + + return nr_freed; +} + +/* + * jbd2_journal_shrink_checkpoint_list + * + * Find 'nr_to_scan' written-back checkpoint buffers in the journal + * and try to release them. Return the number of released checkpointed + * buffers. + * + * Called with j_list_lock held. + */ +unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, + unsigned long *nr_to_scan) +{ + transaction_t *transaction, *last_transaction, *next_transaction; + bool __maybe_unused released; + tid_t first_tid = 0, last_tid = 0, next_tid = 0; + tid_t tid = 0; + unsigned long nr_freed = 0; + unsigned long freed; + bool first_set = false; + +again: + spin_lock(&journal->j_list_lock); + if (!journal->j_checkpoint_transactions) { + spin_unlock(&journal->j_list_lock); + goto out; + } + + /* + * Get next shrink transaction, resume previous scan or start + * over again. If some others do checkpoint and drop transaction + * from the checkpoint list, we ignore saved j_shrink_transaction + * and start over unconditionally. 
+ */ + if (journal->j_shrink_transaction) + transaction = journal->j_shrink_transaction; + else + transaction = journal->j_checkpoint_transactions; + + if (!first_set) { + first_tid = transaction->t_tid; + first_set = true; + } + last_transaction = journal->j_checkpoint_transactions->t_cpprev; + next_transaction = transaction; + last_tid = last_transaction->t_tid; + do { + transaction = next_transaction; + next_transaction = transaction->t_cpnext; + tid = transaction->t_tid; + + freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list, + JBD2_SHRINK_BUSY_SKIP, &released); + nr_freed += freed; + (*nr_to_scan) -= min(*nr_to_scan, freed); + if (*nr_to_scan == 0) + break; + if (need_resched() || spin_needbreak(&journal->j_list_lock)) + break; + } while (transaction != last_transaction); + + if (transaction != last_transaction) { + journal->j_shrink_transaction = next_transaction; + next_tid = next_transaction->t_tid; + } else { + journal->j_shrink_transaction = NULL; + next_tid = 0; + } + + spin_unlock(&journal->j_list_lock); + cond_resched(); + + if (*nr_to_scan && journal->j_shrink_transaction) + goto again; +out: + trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid, + nr_freed, next_tid); + + return nr_freed; +} + +/* + * journal_clean_checkpoint_list + * + * Find all the written-back checkpoint buffers in the journal and release them. + * If 'type' is JBD2_SHRINK_DESTROY, release all buffers unconditionally. If + * 'type' is JBD2_SHRINK_BUSY_STOP, will stop release buffers if encounters a + * busy buffer. To avoid wasting CPU cycles scanning the buffer list in some + * cases, don't pass JBD2_SHRINK_BUSY_SKIP 'type' for this function. + * + * Called with j_list_lock held. + */ +void __jbd2_journal_clean_checkpoint_list(journal_t *journal, + enum jbd2_shrink_type type) +{ + transaction_t *transaction, *last_transaction, *next_transaction; + bool released; + + WARN_ON_ONCE(type == JBD2_SHRINK_BUSY_SKIP); + + transaction = journal->j_checkpoint_transactions; + if (!transaction) + return; + + last_transaction = transaction->t_cpprev; + next_transaction = transaction; + do { + transaction = next_transaction; + next_transaction = transaction->t_cpnext; + journal_shrink_one_cp_list(transaction->t_checkpoint_list, + type, &released); + /* + * This function only frees up some memory if possible so we + * dont have an obligation to finish processing. Bail out if + * preemption requested: + */ + if (need_resched()) + return; + /* + * Stop scanning if we couldn't free the transaction. This + * avoids pointless scanning of transactions which still + * weren't checkpointed. + */ + if (!released) + return; + } while (transaction != last_transaction); +} + +/* + * Remove buffers from all checkpoint lists as journal is aborted and we just + * need to free memory + */ +void jbd2_journal_destroy_checkpoint(journal_t *journal) +{ + /* + * We loop because __jbd2_journal_clean_checkpoint_list() may abort + * early due to a need of rescheduling. + */ + while (1) { + spin_lock(&journal->j_list_lock); + if (!journal->j_checkpoint_transactions) { + spin_unlock(&journal->j_list_lock); + break; + } + __jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_DESTROY); + spin_unlock(&journal->j_list_lock); + cond_resched(); + } +} + +/* + * journal_remove_checkpoint: called after a buffer has been committed + * to disk (either by being write-back flushed to disk, or being + * committed to the log). 
+ * + * We cannot safely clean a transaction out of the log until all of the + * buffer updates committed in that transaction have safely been stored + * elsewhere on disk. To achieve this, all of the buffers in a + * transaction need to be maintained on the transaction's checkpoint + * lists until they have been rewritten, at which point this function is + * called to remove the buffer from the existing transaction's + * checkpoint lists. + * + * The function returns 1 if it frees the transaction, 0 otherwise. + * The function can free jh and bh. + * + * This function is called with j_list_lock held. + */ +int __jbd2_journal_remove_checkpoint(struct journal_head *jh) +{ + struct transaction_chp_stats_s *stats; + transaction_t *transaction; + journal_t *journal; + + JBUFFER_TRACE(jh, "entry"); + + transaction = jh->b_cp_transaction; + if (!transaction) { + JBUFFER_TRACE(jh, "not on transaction"); + return 0; + } + journal = transaction->t_journal; + + JBUFFER_TRACE(jh, "removing from transaction"); + + __buffer_unlink(jh); + jh->b_cp_transaction = NULL; + percpu_counter_dec(&journal->j_checkpoint_jh_count); + jbd2_journal_put_journal_head(jh); + + /* Is this transaction empty? */ + if (transaction->t_checkpoint_list) + return 0; + + /* + * There is one special case to worry about: if we have just pulled the + * buffer off a running or committing transaction's checkpoing list, + * then even if the checkpoint list is empty, the transaction obviously + * cannot be dropped! + * + * The locking here around t_state is a bit sleazy. + * See the comment at the end of jbd2_journal_commit_transaction(). + */ + if (transaction->t_state != T_FINISHED) + return 0; + + /* + * OK, that was the last buffer for the transaction, we can now + * safely remove this transaction from the log. + */ + stats = &transaction->t_chp_stats; + if (stats->cs_chp_time) + stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time, + jiffies); + trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev, + transaction->t_tid, stats); + + __jbd2_journal_drop_transaction(journal, transaction); + jbd2_journal_free_transaction(transaction); + return 1; +} + +/* + * Check the checkpoint buffer and try to remove it from the checkpoint + * list if it's clean. Returns -EBUSY if it is not clean, returns 1 if + * it frees the transaction, 0 otherwise. + * + * This function is called with j_list_lock held. + */ +int jbd2_journal_try_remove_checkpoint(struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + if (jh->b_transaction) + return -EBUSY; + if (!trylock_buffer(bh)) + return -EBUSY; + if (buffer_dirty(bh)) { + unlock_buffer(bh); + return -EBUSY; + } + unlock_buffer(bh); + + /* + * Buffer is clean and the IO has finished (we held the buffer + * lock) so the checkpoint is done. We can safely remove the + * buffer from this transaction. + */ + JBUFFER_TRACE(jh, "remove from checkpoint list"); + return __jbd2_journal_remove_checkpoint(jh); +} + +/* + * journal_insert_checkpoint: put a committed buffer onto a checkpoint + * list so that we know when it is safe to clean the transaction out of + * the log. + * + * Called with the journal locked. + * Called with j_list_lock held. 
+ */ +void __jbd2_journal_insert_checkpoint(struct journal_head *jh, + transaction_t *transaction) +{ + JBUFFER_TRACE(jh, "entry"); + J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + + /* Get reference for checkpointing transaction */ + jbd2_journal_grab_journal_head(jh2bh(jh)); + jh->b_cp_transaction = transaction; + + if (!transaction->t_checkpoint_list) { + jh->b_cpnext = jh->b_cpprev = jh; + } else { + jh->b_cpnext = transaction->t_checkpoint_list; + jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_list = jh; + percpu_counter_inc(&transaction->t_journal->j_checkpoint_jh_count); +} + +/* + * We've finished with this transaction structure: adios... + * + * The transaction must have no links except for the checkpoint by this + * point. + * + * Called with the journal locked. + * Called with j_list_lock held. + */ + +void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction) +{ + assert_spin_locked(&journal->j_list_lock); + + journal->j_shrink_transaction = NULL; + if (transaction->t_cpnext) { + transaction->t_cpnext->t_cpprev = transaction->t_cpprev; + transaction->t_cpprev->t_cpnext = transaction->t_cpnext; + if (journal->j_checkpoint_transactions == transaction) + journal->j_checkpoint_transactions = + transaction->t_cpnext; + if (journal->j_checkpoint_transactions == transaction) + journal->j_checkpoint_transactions = NULL; + } + + J_ASSERT(transaction->t_state == T_FINISHED); + J_ASSERT(transaction->t_buffers == NULL); + J_ASSERT(transaction->t_forget == NULL); + J_ASSERT(transaction->t_shadow_list == NULL); + J_ASSERT(transaction->t_checkpoint_list == NULL); + J_ASSERT(atomic_read(&transaction->t_updates) == 0); + J_ASSERT(journal->j_committing_transaction != transaction); + J_ASSERT(journal->j_running_transaction != transaction); + + trace_jbd2_drop_transaction(journal, transaction); + + jbd2_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); +} diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c new file mode 100644 index 00000000000..1467f679074 --- /dev/null +++ b/fs/jbd2/revoke.c @@ -0,0 +1,743 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * linux/fs/jbd2/revoke.c + * + * Written by Stephen C. Tweedie <sct@redhat.com>, 2000 + * + * Copyright 2000 Red Hat corp --- All Rights Reserved + * + * Journal revoke routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + * + * Revoke is the mechanism used to prevent old log records for deleted + * metadata from being replayed on top of newer data using the same + * blocks. The revoke mechanism is used in two separate places: + * + * + Commit: during commit we write the entire list of the current + * transaction's revoked blocks to the journal + * + * + Recovery: during recovery we record the transaction ID of all + * revoked blocks. If there are multiple revoke records in the log + * for a single block, only the last one counts, and if there is a log + * entry for a block beyond the last revoke, then that log entry still + * gets replayed. + * + * We can get interactions between revokes and new log data within a + * single transaction: + * + * Block is revoked and then journaled: + * The desired end result is the journaling of the new block, so we + * cancel the revoke before the transaction commits. 
+ * + * Block is journaled and then revoked: + * The revoke must take precedence over the write of the block, so we + * need either to cancel the journal entry or to write the revoke + * later in the log than the log block. In this case, we choose the + * latter: journaling a block cancels any revoke record for that block + * in the current transaction, so any revoke for that block in the + * transaction must have happened after the block was journaled and so + * the revoke must take precedence. + * + * Block is revoked and then written as data: + * The data write is allowed to succeed, but the revoke is _not_ + * cancelled. We still need to prevent old log records from + * overwriting the new data. We don't even need to clear the revoke + * bit here. + * + * We cache revoke status of a buffer in the current transaction in b_states + * bits. As the name says, revokevalid flag indicates that the cached revoke + * status of a buffer is valid and we can rely on the cached status. + * + * Revoke information on buffers is a tri-state value: + * + * RevokeValid clear: no cached revoke status, need to look it up + * RevokeValid set, Revoked clear: + * buffer has not been revoked, and cancel_revoke + * need do nothing. + * RevokeValid set, Revoked set: + * buffer has been revoked. + * + * Locking rules: + * We keep two hash tables of revoke records. One hashtable belongs to the + * running transaction (is pointed to by journal->j_revoke), the other one + * belongs to the committing transaction. Accesses to the second hash table + * happen only from the kjournald and no other thread touches this table. Also + * journal_switch_revoke_table() which switches which hashtable belongs to the + * running and which to the committing transaction is called only from + * kjournald. Therefore we need no locks when accessing the hashtable belonging + * to the committing transaction. + * + * All users operating on the hash table belonging to the running transaction + * have a handle to the transaction. Therefore they are safe from kjournald + * switching hash tables under them. For operations on the lists of entries in + * the hash table j_revoke_lock is used. + * + * Finally, also replay code uses the hash tables but at this moment no one else + * can touch them (filesystem isn't mounted yet) and hence no locking is + * needed. + */ + +#ifndef __KERNEL__ +#include "jfs_user.h" +#else +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/jbd2.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/init.h> +#include <linux/bio.h> +#include <linux/log2.h> +#include <linux/hash.h> +#endif + +static struct kmem_cache *jbd2_revoke_record_cache; +static struct kmem_cache *jbd2_revoke_table_cache; + +/* Each revoke record represents one single revoked block. During + journal replay, this involves recording the transaction ID of the + last transaction to revoke this block. */ + +struct jbd2_revoke_record_s +{ + struct list_head hash; + tid_t sequence; /* Used for recovery only */ + unsigned long long blocknr; +}; + + +/* The revoke table is just a simple hash table of revoke records. */ +struct jbd2_revoke_table_s +{ + /* It is conceivable that we might want a larger hash table + * for recovery. Must be a power of two. 
*/ + int hash_size; + int hash_shift; + struct list_head *hash_table; +}; + + +#ifdef __KERNEL__ +static void write_one_revoke_record(transaction_t *, + struct list_head *, + struct buffer_head **, int *, + struct jbd2_revoke_record_s *); +static void flush_descriptor(journal_t *, struct buffer_head *, int); +#endif + +/* Utility functions to maintain the revoke table */ + +static inline int hash(journal_t *journal, unsigned long long block) +{ + return hash_64(block, journal->j_revoke->hash_shift); +} + +static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, + tid_t seq) +{ + struct list_head *hash_list; + struct jbd2_revoke_record_s *record; + gfp_t gfp_mask = GFP_NOFS; + + if (journal_oom_retry) + gfp_mask |= __GFP_NOFAIL; + record = kmem_cache_alloc(jbd2_revoke_record_cache, gfp_mask); + if (!record) + return -ENOMEM; + + record->sequence = seq; + record->blocknr = blocknr; + hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; + spin_lock(&journal->j_revoke_lock); + list_add(&record->hash, hash_list); + spin_unlock(&journal->j_revoke_lock); + return 0; +} + +/* Find a revoke record in the journal's hash table. */ + +static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal, + unsigned long long blocknr) +{ + struct list_head *hash_list; + struct jbd2_revoke_record_s *record; + + hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; + + spin_lock(&journal->j_revoke_lock); + record = (struct jbd2_revoke_record_s *) hash_list->next; + while (&(record->hash) != hash_list) { + if (record->blocknr == blocknr) { + spin_unlock(&journal->j_revoke_lock); + return record; + } + record = (struct jbd2_revoke_record_s *) record->hash.next; + } + spin_unlock(&journal->j_revoke_lock); + return NULL; +} + +void jbd2_journal_destroy_revoke_record_cache(void) +{ + kmem_cache_destroy(jbd2_revoke_record_cache); + jbd2_revoke_record_cache = NULL; +} + +void jbd2_journal_destroy_revoke_table_cache(void) +{ + kmem_cache_destroy(jbd2_revoke_table_cache); + jbd2_revoke_table_cache = NULL; +} + +int __init jbd2_journal_init_revoke_record_cache(void) +{ + J_ASSERT(!jbd2_revoke_record_cache); + jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s, + SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY); + + if (!jbd2_revoke_record_cache) { + pr_emerg("JBD2: failed to create revoke_record cache\n"); + return -ENOMEM; + } + return 0; +} + +int __init jbd2_journal_init_revoke_table_cache(void) +{ + J_ASSERT(!jbd2_revoke_table_cache); + jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s, + SLAB_TEMPORARY); + if (!jbd2_revoke_table_cache) { + pr_emerg("JBD2: failed to create revoke_table cache\n"); + return -ENOMEM; + } + return 0; +} + +struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) +{ + int shift = 0; + int tmp = hash_size; + struct jbd2_revoke_table_s *table; + + table = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL); + if (!table) + goto out; + + while((tmp >>= 1UL) != 0UL) + shift++; + + table->hash_size = hash_size; + table->hash_shift = shift; + table->hash_table = + kvmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL); + if (!table->hash_table) { + kmem_cache_free(jbd2_revoke_table_cache, table); + table = NULL; + goto out; + } + + for (tmp = 0; tmp < hash_size; tmp++) + INIT_LIST_HEAD(&table->hash_table[tmp]); + +out: + return table; +} + +void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) +{ + int i; + struct list_head *hash_list; + + for (i = 0; i < table->hash_size; i++) { + 
hash_list = &table->hash_table[i]; + J_ASSERT(list_empty(hash_list)); + } + + kvfree(table->hash_table); + kmem_cache_free(jbd2_revoke_table_cache, table); +} + +/* Initialise the revoke table for a given journal to a given size. */ +int jbd2_journal_init_revoke(journal_t *journal, int hash_size) +{ + J_ASSERT(journal->j_revoke_table[0] == NULL); + J_ASSERT(is_power_of_2(hash_size)); + + journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[0]) + goto fail0; + + journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[1]) + goto fail1; + + journal->j_revoke = journal->j_revoke_table[1]; + + spin_lock_init(&journal->j_revoke_lock); + + return 0; + +fail1: + jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); + journal->j_revoke_table[0] = NULL; +fail0: + return -ENOMEM; +} + +/* Destroy a journal's revoke table. The table must already be empty! */ +void jbd2_journal_destroy_revoke(journal_t *journal) +{ + journal->j_revoke = NULL; + if (journal->j_revoke_table[0]) + jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); + if (journal->j_revoke_table[1]) + jbd2_journal_destroy_revoke_table(journal->j_revoke_table[1]); +} + + +#ifdef __KERNEL__ + +/* + * jbd2_journal_revoke: revoke a given buffer_head from the journal. This + * prevents the block from being replayed during recovery if we take a + * crash after this current transaction commits. Any subsequent + * metadata writes of the buffer in this transaction cancel the + * revoke. + * + * Note that this call may block --- it is up to the caller to make + * sure that there are no further calls to journal_write_metadata + * before the revoke is complete. In ext3, this implies calling the + * revoke before clearing the block bitmap when we are deleting + * metadata. + * + * Revoke performs a jbd2_journal_forget on any buffer_head passed in as a + * parameter, but does _not_ forget the buffer_head if the bh was only + * found implicitly. + * + * bh_in may not be a journalled buffer - it may have come off + * the hash tables without an attached journal_head. + * + * If bh_in is non-zero, jbd2_journal_revoke() will decrement its b_count + * by one. + */ + +int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, + struct buffer_head *bh_in) +{ + struct buffer_head *bh = NULL; + journal_t *journal; + struct block_device *bdev; + int err; + + might_sleep(); + if (bh_in) + BUFFER_TRACE(bh_in, "enter"); + + journal = handle->h_transaction->t_journal; + if (!jbd2_journal_set_features(journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)){ + J_ASSERT (!"Cannot set revoke feature!"); + return -EINVAL; + } + + bdev = journal->j_fs_dev; + bh = bh_in; + + if (!bh) { + bh = __find_get_block_nonatomic(bdev, blocknr, + journal->j_blocksize); + if (bh) + BUFFER_TRACE(bh, "found on hash"); + } +#ifdef JBD2_EXPENSIVE_CHECKING + else { + struct buffer_head *bh2; + + /* If there is a different buffer_head lying around in + * memory anywhere... */ + bh2 = __find_get_block_nonatomic(bdev, blocknr, + journal->j_blocksize); + if (bh2) { + /* ... and it has RevokeValid status... */ + if (bh2 != bh && buffer_revokevalid(bh2)) + /* ...then it better be revoked too, + * since it's illegal to create a revoke + * record against a buffer_head which is + * not marked revoked --- that would + * risk missing a subsequent revoke + * cancel. 
*/ + J_ASSERT_BH(bh2, buffer_revoked(bh2)); + put_bh(bh2); + } + } +#endif + + if (WARN_ON_ONCE(handle->h_revoke_credits <= 0)) { + if (!bh_in) + brelse(bh); + return -EIO; + } + /* We really ought not ever to revoke twice in a row without + first having the revoke cancelled: it's illegal to free a + block twice without allocating it in between! */ + if (bh) { + if (!J_EXPECT_BH(bh, !buffer_revoked(bh), + "inconsistent data on disk")) { + if (!bh_in) + brelse(bh); + return -EIO; + } + set_buffer_revoked(bh); + set_buffer_revokevalid(bh); + if (bh_in) { + BUFFER_TRACE(bh_in, "call jbd2_journal_forget"); + jbd2_journal_forget(handle, bh_in); + } else { + BUFFER_TRACE(bh, "call brelse"); + __brelse(bh); + } + } + handle->h_revoke_credits--; + + jbd2_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in); + err = insert_revoke_hash(journal, blocknr, + handle->h_transaction->t_tid); + BUFFER_TRACE(bh_in, "exit"); + return err; +} + +/* + * Cancel an outstanding revoke. For use only internally by the + * journaling code (called from jbd2_journal_get_write_access). + * + * We trust buffer_revoked() on the buffer if the buffer is already + * being journaled: if there is no revoke pending on the buffer, then we + * don't do anything here. + * + * This would break if it were possible for a buffer to be revoked and + * discarded, and then reallocated within the same transaction. In such + * a case we would have lost the revoked bit, but when we arrived here + * the second time we would still have a pending revoke to cancel. So, + * do not trust the Revoked bit on buffers unless RevokeValid is also + * set. + */ +void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) +{ + struct jbd2_revoke_record_s *record; + journal_t *journal = handle->h_transaction->t_journal; + int need_cancel; + struct buffer_head *bh = jh2bh(jh); + + jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh); + + /* Is the existing Revoke bit valid? If so, we trust it, and + * only perform the full cancel if the revoke bit is set. If + * not, we can't trust the revoke bit, and we need to do the + * full search for a revoke record. */ + if (test_set_buffer_revokevalid(bh)) { + need_cancel = test_clear_buffer_revoked(bh); + } else { + need_cancel = 1; + clear_buffer_revoked(bh); + } + + if (need_cancel) { + record = find_revoke_record(journal, bh->b_blocknr); + if (record) { + jbd2_debug(4, "cancelled existing revoke on " + "blocknr %llu\n", (unsigned long long)bh->b_blocknr); + spin_lock(&journal->j_revoke_lock); + list_del(&record->hash); + spin_unlock(&journal->j_revoke_lock); + kmem_cache_free(jbd2_revoke_record_cache, record); + } + } + +#ifdef JBD2_EXPENSIVE_CHECKING + /* There better not be one left behind by now! */ + record = find_revoke_record(journal, bh->b_blocknr); + J_ASSERT_JH(jh, record == NULL); +#endif + + /* Finally, have we just cleared revoke on an unhashed + * buffer_head? If so, we'd better make sure we clear the + * revoked status on any hashed alias too, otherwise the revoke + * state machine will get very upset later on. */ + if (need_cancel) { + struct buffer_head *bh2; + bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr, + bh->b_size); + if (bh2) { + if (bh2 != bh) + clear_buffer_revoked(bh2); + __brelse(bh2); + } + } +} + +/* + * jbd2_clear_buffer_revoked_flags clears revoked flag of buffers in + * revoke table to reflect there is no revoked buffers in the next + * transaction which is going to be started. 
+ */ +void jbd2_clear_buffer_revoked_flags(journal_t *journal) +{ + struct jbd2_revoke_table_s *revoke = journal->j_revoke; + int i = 0; + + for (i = 0; i < revoke->hash_size; i++) { + struct list_head *hash_list; + struct list_head *list_entry; + hash_list = &revoke->hash_table[i]; + + list_for_each(list_entry, hash_list) { + struct jbd2_revoke_record_s *record; + struct buffer_head *bh; + record = (struct jbd2_revoke_record_s *)list_entry; + bh = __find_get_block_nonatomic(journal->j_fs_dev, + record->blocknr, + journal->j_blocksize); + if (bh) { + clear_buffer_revoked(bh); + __brelse(bh); + } + } + } +} + +/* jbd2_journal_switch_revoke_table table select j_revoke for next + * transaction we do not want to suspend any processing until all + * revokes are written -bzzz + */ +void jbd2_journal_switch_revoke_table(journal_t *journal) +{ + int i; + + if (journal->j_revoke == journal->j_revoke_table[0]) + journal->j_revoke = journal->j_revoke_table[1]; + else + journal->j_revoke = journal->j_revoke_table[0]; + + for (i = 0; i < journal->j_revoke->hash_size; i++) + INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); +} + +/* + * Write revoke records to the journal for all entries in the current + * revoke hash, deleting the entries as we go. + */ +void jbd2_journal_write_revoke_records(transaction_t *transaction, + struct list_head *log_bufs) +{ + journal_t *journal = transaction->t_journal; + struct buffer_head *descriptor; + struct jbd2_revoke_record_s *record; + struct jbd2_revoke_table_s *revoke; + struct list_head *hash_list; + int i, offset, count; + + descriptor = NULL; + offset = 0; + count = 0; + + /* select revoke table for committing transaction */ + revoke = journal->j_revoke == journal->j_revoke_table[0] ? + journal->j_revoke_table[1] : journal->j_revoke_table[0]; + + for (i = 0; i < revoke->hash_size; i++) { + hash_list = &revoke->hash_table[i]; + + while (!list_empty(hash_list)) { + record = (struct jbd2_revoke_record_s *) + hash_list->next; + write_one_revoke_record(transaction, log_bufs, + &descriptor, &offset, record); + count++; + list_del(&record->hash); + kmem_cache_free(jbd2_revoke_record_cache, record); + } + } + if (descriptor) + flush_descriptor(journal, descriptor, offset); + jbd2_debug(1, "Wrote %d revoke records\n", count); +} + +/* + * Write out one revoke record. We need to create a new descriptor + * block if the old one is full or if we have not already created one. + */ + +static void write_one_revoke_record(transaction_t *transaction, + struct list_head *log_bufs, + struct buffer_head **descriptorp, + int *offsetp, + struct jbd2_revoke_record_s *record) +{ + journal_t *journal = transaction->t_journal; + int csum_size = 0; + struct buffer_head *descriptor; + int sz, offset; + + /* If we are already aborting, this all becomes a noop. We + still need to go round the loop in + jbd2_journal_write_revoke_records in order to free all of the + revoke records: only the IO to the journal is omitted. */ + if (is_journal_aborted(journal)) + return; + + descriptor = *descriptorp; + offset = *offsetp; + + /* Do we need to leave space at the end for a checksum? 
*/ + if (jbd2_journal_has_csum_v2or3(journal)) + csum_size = sizeof(struct jbd2_journal_block_tail); + + if (jbd2_has_feature_64bit(journal)) + sz = 8; + else + sz = 4; + + /* Make sure we have a descriptor with space left for the record */ + if (descriptor) { + if (offset + sz > journal->j_blocksize - csum_size) { + flush_descriptor(journal, descriptor, offset); + descriptor = NULL; + } + } + + if (!descriptor) { + descriptor = jbd2_journal_get_descriptor_buffer(transaction, + JBD2_REVOKE_BLOCK); + if (!descriptor) + return; + + /* Record it so that we can wait for IO completion later */ + BUFFER_TRACE(descriptor, "file in log_bufs"); + jbd2_file_log_bh(log_bufs, descriptor); + + offset = sizeof(jbd2_journal_revoke_header_t); + *descriptorp = descriptor; + } + + if (jbd2_has_feature_64bit(journal)) + * ((__be64 *)(&descriptor->b_data[offset])) = + cpu_to_be64(record->blocknr); + else + * ((__be32 *)(&descriptor->b_data[offset])) = + cpu_to_be32(record->blocknr); + offset += sz; + + *offsetp = offset; +} + +/* + * Flush a revoke descriptor out to the journal. If we are aborting, + * this is a noop; otherwise we are generating a buffer which needs to + * be waited for during commit, so it has to go onto the appropriate + * journal buffer list. + */ + +static void flush_descriptor(journal_t *journal, + struct buffer_head *descriptor, + int offset) +{ + jbd2_journal_revoke_header_t *header; + + if (is_journal_aborted(journal)) + return; + + header = (jbd2_journal_revoke_header_t *)descriptor->b_data; + header->r_count = cpu_to_be32(offset); + jbd2_descriptor_block_csum_set(journal, descriptor); + + set_buffer_jwrite(descriptor); + BUFFER_TRACE(descriptor, "write"); + set_buffer_dirty(descriptor); + write_dirty_buffer(descriptor, JBD2_JOURNAL_REQ_FLAGS); +} +#endif + +/* + * Revoke support for recovery. + * + * Recovery needs to be able to: + * + * record all revoke records, including the tid of the latest instance + * of each revoke in the journal + * + * check whether a given block in a given transaction should be replayed + * (ie. has not been revoked by a revoke record in that or a subsequent + * transaction) + * + * empty the revoke table after recovery. + */ + +/* + * First, setting revoke records. We create a new revoke record for + * every block ever revoked in the log as we scan it for recovery, and + * we update the existing records if we find multiple revokes for a + * single block. + */ + +int jbd2_journal_set_revoke(journal_t *journal, + unsigned long long blocknr, + tid_t sequence) +{ + struct jbd2_revoke_record_s *record; + + record = find_revoke_record(journal, blocknr); + if (record) { + /* If we have multiple occurrences, only record the + * latest sequence number in the hashed record */ + if (tid_gt(sequence, record->sequence)) + record->sequence = sequence; + return 0; + } + return insert_revoke_hash(journal, blocknr, sequence); +} + +/* + * Test revoke records. For a given block referenced in the log, has + * that block been revoked? A revoke record with a given transaction + * sequence number revokes all blocks in that transaction and earlier + * ones, but later transactions still need replayed. 
+ */ + +int jbd2_journal_test_revoke(journal_t *journal, + unsigned long long blocknr, + tid_t sequence) +{ + struct jbd2_revoke_record_s *record; + + record = find_revoke_record(journal, blocknr); + if (!record) + return 0; + if (tid_gt(sequence, record->sequence)) + return 0; + return 1; +} + +/* + * Finally, once recovery is over, we need to clear the revoke table so + * that it can be reused by the running filesystem. + */ + +void jbd2_journal_clear_revoke(journal_t *journal) +{ + int i; + struct list_head *hash_list; + struct jbd2_revoke_record_s *record; + struct jbd2_revoke_table_s *revoke; + + revoke = journal->j_revoke; + + for (i = 0; i < revoke->hash_size; i++) { + hash_list = &revoke->hash_table[i]; + while (!list_empty(hash_list)) { + record = (struct jbd2_revoke_record_s*) hash_list->next; + list_del(&record->hash); + kmem_cache_free(jbd2_revoke_record_cache, record); + } + } +} -- 2.43.0
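Note: the revoke machinery added above is consumed by the recovery
code in the next patch. A minimal sketch of the recovery-side
contract, using the two functions at the end of revoke.c (the helper
shown here is hypothetical; the real caller is the replay pass in
recovery.c):

	/* Hypothetical replay helper. During the REVOKE scan,
	 * jbd2_journal_set_revoke() records the newest revoking TID
	 * for each block; during the REPLAY scan, a journalled block
	 * is written back only if no revoke record at or after its
	 * commit TID exists
	 */
	static int replay_one_block(journal_t *journal,
				    unsigned long long blocknr,
				    tid_t commit_tid)
	{
		if (jbd2_journal_test_revoke(journal, blocknr,
					     commit_tid))
			return 0;	/* revoked: skip replay */

		/* ...copy the journalled block to its home location... */
		return 1;
	}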
From: Simon Glass <simon.glass@canonical.com> Add the JBD2 journaling layer recovery and commit functionality from the Linux 6.18 kernel ext4 filesystem driver. recovery.c handles: - Journal recovery after unclean shutdown - Transaction replay and verification - Descriptor block parsing - Revoke block processing commit.c handles: - Transaction commit processing - Descriptor block writing - Data and metadata buffer submission - Checksum calculation for journal blocks These files are needed for journal integrity and crash recovery in the ext4 filesystem. Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: Simon Glass <simon.glass@canonical.com> --- fs/jbd2/commit.c | 1160 ++++++++++++++++++++++++++++++++++++++++++++ fs/jbd2/recovery.c | 996 +++++++++++++++++++++++++++++++++++++ 2 files changed, 2156 insertions(+) create mode 100644 fs/jbd2/commit.c create mode 100644 fs/jbd2/recovery.c diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c new file mode 100644 index 00000000000..7203d2d2624 --- /dev/null +++ b/fs/jbd2/commit.c @@ -0,0 +1,1160 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * linux/fs/jbd2/commit.c + * + * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * Journal commit routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/jbd2.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/jiffies.h> +#include <linux/crc32.h> +#include <linux/writeback.h> +#include <linux/backing-dev.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/bitops.h> +#include <trace/events/jbd2.h> + +/* + * IO end handler for temporary buffer_heads handling writes to the journal. + */ +static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) +{ + struct buffer_head *orig_bh = bh->b_private; + + BUFFER_TRACE(bh, ""); + if (uptodate) + set_buffer_uptodate(bh); + else + clear_buffer_uptodate(bh); + if (orig_bh) { + clear_bit_unlock(BH_Shadow, &orig_bh->b_state); + smp_mb__after_atomic(); + wake_up_bit(&orig_bh->b_state, BH_Shadow); + } + unlock_buffer(bh); +} + +/* + * When an ext4 file is truncated, it is possible that some pages are not + * successfully freed, because they are attached to a committing transaction. + * After the transaction commits, these pages are left on the LRU, with no + * ->mapping, and with attached buffers. These pages are trivially reclaimable + * by the VM, but their apparent absence upsets the VM accounting, and it makes + * the numbers in /proc/meminfo look odd. + * + * So here, we have a buffer which has just come off the forget list. Look to + * see if we can strip all buffers from the backing page. + * + * Called under j_list_lock. The caller provided us with a ref against the + * buffer, and we drop that here. 
+ */ +static void release_buffer_page(struct buffer_head *bh) +{ + struct folio *folio; + + if (buffer_dirty(bh)) + goto nope; + if (atomic_read(&bh->b_count) != 1) + goto nope; + folio = bh->b_folio; + if (folio->mapping) + goto nope; + + /* OK, it's a truncated page */ + if (!folio_trylock(folio)) + goto nope; + + folio_get(folio); + __brelse(bh); + try_to_free_buffers(folio); + folio_unlock(folio); + folio_put(folio); + return; + +nope: + __brelse(bh); +} + +static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh) +{ + struct commit_header *h; + __u32 csum; + + if (!jbd2_journal_has_csum_v2or3(j)) + return; + + h = (struct commit_header *)(bh->b_data); + h->h_chksum_type = 0; + h->h_chksum_size = 0; + h->h_chksum[0] = 0; + csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize); + h->h_chksum[0] = cpu_to_be32(csum); +} + +/* + * Done it all: now submit the commit record. We should have + * cleaned up our previous buffers by now, so if we are in abort + * mode we can now just skip the rest of the journal write + * entirely. + * + * Returns 1 if the journal needs to be aborted or 0 on success + */ +static int journal_submit_commit_record(journal_t *journal, + transaction_t *commit_transaction, + struct buffer_head **cbh, + __u32 crc32_sum) +{ + struct commit_header *tmp; + struct buffer_head *bh; + struct timespec64 now; + blk_opf_t write_flags = REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS; + + *cbh = NULL; + + if (is_journal_aborted(journal)) + return 0; + + bh = jbd2_journal_get_descriptor_buffer(commit_transaction, + JBD2_COMMIT_BLOCK); + if (!bh) + return 1; + + tmp = (struct commit_header *)bh->b_data; + ktime_get_coarse_real_ts64(&now); + tmp->h_commit_sec = cpu_to_be64(now.tv_sec); + tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); + + if (jbd2_has_feature_checksum(journal)) { + tmp->h_chksum_type = JBD2_CRC32_CHKSUM; + tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; + tmp->h_chksum[0] = cpu_to_be32(crc32_sum); + } + jbd2_commit_block_csum_set(journal, bh); + + BUFFER_TRACE(bh, "submit commit block"); + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + bh->b_end_io = journal_end_buffer_io_sync; + + if (journal->j_flags & JBD2_BARRIER && + !jbd2_has_feature_async_commit(journal)) + write_flags |= REQ_PREFLUSH | REQ_FUA; + + submit_bh(write_flags, bh); + *cbh = bh; + return 0; +} + +/* + * This function along with journal_submit_commit_record + * allows to write the commit record asynchronously. 
+ */ +static int journal_wait_on_commit_record(journal_t *journal, + struct buffer_head *bh) +{ + int ret = 0; + + clear_buffer_dirty(bh); + wait_on_buffer(bh); + + if (unlikely(!buffer_uptodate(bh))) + ret = -EIO; + put_bh(bh); /* One for getblk() */ + + return ret; +} + +/* Send all the data buffers related to an inode */ +int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode) +{ + if (!jinode || !(jinode->i_flags & JI_WRITE_DATA)) + return 0; + + trace_jbd2_submit_inode_data(jinode->i_vfs_inode); + return journal->j_submit_inode_data_buffers(jinode); + +} +EXPORT_SYMBOL(jbd2_submit_inode_data); + +int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode) +{ + if (!jinode || !(jinode->i_flags & JI_WAIT_DATA) || + !jinode->i_vfs_inode || !jinode->i_vfs_inode->i_mapping) + return 0; + return filemap_fdatawait_range_keep_errors( + jinode->i_vfs_inode->i_mapping, jinode->i_dirty_start, + jinode->i_dirty_end); +} +EXPORT_SYMBOL(jbd2_wait_inode_data); + +/* + * Submit all the data buffers of inode associated with the transaction to + * disk. + * + * We are in a committing transaction. Therefore no new inode can be added to + * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently + * operate on from being released while we write out pages. + */ +static int journal_submit_data_buffers(journal_t *journal, + transaction_t *commit_transaction) +{ + struct jbd2_inode *jinode; + int err, ret = 0; + + spin_lock(&journal->j_list_lock); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + if (!(jinode->i_flags & JI_WRITE_DATA)) + continue; + jinode->i_flags |= JI_COMMIT_RUNNING; + spin_unlock(&journal->j_list_lock); + /* submit the inode data buffers. */ + trace_jbd2_submit_inode_data(jinode->i_vfs_inode); + if (journal->j_submit_inode_data_buffers) { + err = journal->j_submit_inode_data_buffers(jinode); + if (!ret) + ret = err; + } + spin_lock(&journal->j_list_lock); + J_ASSERT(jinode->i_transaction == commit_transaction); + jinode->i_flags &= ~JI_COMMIT_RUNNING; + smp_mb(); + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + } + spin_unlock(&journal->j_list_lock); + return ret; +} + +int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) +{ + struct address_space *mapping = jinode->i_vfs_inode->i_mapping; + + return filemap_fdatawait_range_keep_errors(mapping, + jinode->i_dirty_start, + jinode->i_dirty_end); +} + +/* + * Wait for data submitted for writeout, refile inodes to proper + * transaction if needed. + * + */ +static int journal_finish_inode_data_buffers(journal_t *journal, + transaction_t *commit_transaction) +{ + struct jbd2_inode *jinode, *next_i; + int err, ret = 0; + + /* For locking, see the comment in journal_submit_data_buffers() */ + spin_lock(&journal->j_list_lock); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + if (!(jinode->i_flags & JI_WAIT_DATA)) + continue; + jinode->i_flags |= JI_COMMIT_RUNNING; + spin_unlock(&journal->j_list_lock); + /* wait for the inode data buffers writeout. 
*/ + if (journal->j_finish_inode_data_buffers) { + err = journal->j_finish_inode_data_buffers(jinode); + if (!ret) + ret = err; + } + cond_resched(); + spin_lock(&journal->j_list_lock); + jinode->i_flags &= ~JI_COMMIT_RUNNING; + smp_mb(); + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + } + + /* Now refile inode to proper lists */ + list_for_each_entry_safe(jinode, next_i, + &commit_transaction->t_inode_list, i_list) { + list_del(&jinode->i_list); + if (jinode->i_next_transaction) { + jinode->i_transaction = jinode->i_next_transaction; + jinode->i_next_transaction = NULL; + list_add(&jinode->i_list, + &jinode->i_transaction->t_inode_list); + } else { + jinode->i_transaction = NULL; + jinode->i_dirty_start = 0; + jinode->i_dirty_end = 0; + } + } + spin_unlock(&journal->j_list_lock); + + return ret; +} + +static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) +{ + char *addr; + __u32 checksum; + + addr = kmap_local_folio(bh->b_folio, bh_offset(bh)); + checksum = crc32_be(crc32_sum, addr, bh->b_size); + kunmap_local(addr); + + return checksum; +} + +static void write_tag_block(journal_t *j, journal_block_tag_t *tag, + unsigned long long block) +{ + tag->t_blocknr = cpu_to_be32(block & (u32)~0); + if (jbd2_has_feature_64bit(j)) + tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); +} + +static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, + struct buffer_head *bh, __u32 sequence) +{ + journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; + __u8 *addr; + __u32 csum32; + __be32 seq; + + if (!jbd2_journal_has_csum_v2or3(j)) + return; + + seq = cpu_to_be32(sequence); + addr = kmap_local_folio(bh->b_folio, bh_offset(bh)); + csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); + csum32 = jbd2_chksum(csum32, addr, bh->b_size); + kunmap_local(addr); + + if (jbd2_has_feature_csum3(j)) + tag3->t_checksum = cpu_to_be32(csum32); + else + tag->t_checksum = cpu_to_be16(csum32); +} +/* + * jbd2_journal_commit_transaction + * + * The primary function for committing a transaction to the log. This + * function is called by the journal thread to begin a complete commit. + */ +void jbd2_journal_commit_transaction(journal_t *journal) +{ + struct transaction_stats_s stats; + transaction_t *commit_transaction; + struct journal_head *jh; + struct buffer_head *descriptor; + struct buffer_head **wbuf = journal->j_wbuf; + int bufs; + int escape; + int err; + unsigned long long blocknr; + ktime_t start_time; + u64 commit_time; + char *tagp = NULL; + journal_block_tag_t *tag = NULL; + int space_left = 0; + int first_tag = 0; + int tag_flag; + int i; + int tag_bytes = journal_tag_bytes(journal); + struct buffer_head *cbh = NULL; /* For transactional checksums */ + __u32 crc32_sum = ~0; + struct blk_plug plug; + /* Tail of the journal */ + unsigned long first_block; + tid_t first_tid; + int update_tail; + int csum_size = 0; + LIST_HEAD(io_bufs); + LIST_HEAD(log_bufs); + + if (jbd2_journal_has_csum_v2or3(journal)) + csum_size = sizeof(struct jbd2_journal_block_tail); + + /* + * First job: lock down the current transaction and wait for + * all outstanding updates to complete. + */ + + /* Do we need to erase the effects of a prior jbd2_journal_flush? */ + if (journal->j_flags & JBD2_FLUSHED) { + jbd2_debug(3, "super block updated\n"); + mutex_lock_io(&journal->j_checkpoint_mutex); + /* + * We hold j_checkpoint_mutex so tail cannot change under us. 
+ * We don't need any special data guarantees for writing sb + * since journal is empty and it is ok for write to be + * flushed only with transaction commit. + */ + jbd2_journal_update_sb_log_tail(journal, + journal->j_tail_sequence, + journal->j_tail, 0); + mutex_unlock(&journal->j_checkpoint_mutex); + } else { + jbd2_debug(3, "superblock not updated\n"); + } + + J_ASSERT(journal->j_running_transaction != NULL); + J_ASSERT(journal->j_committing_transaction == NULL); + + write_lock(&journal->j_state_lock); + journal->j_flags |= JBD2_FULL_COMMIT_ONGOING; + while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) { + DEFINE_WAIT(wait); + + prepare_to_wait(&journal->j_fc_wait, &wait, + TASK_UNINTERRUPTIBLE); + write_unlock(&journal->j_state_lock); + schedule(); + write_lock(&journal->j_state_lock); + finish_wait(&journal->j_fc_wait, &wait); + /* + * TODO: by blocking fast commits here, we are increasing + * fsync() latency slightly. Strictly speaking, we don't need + * to block fast commits until the transaction enters T_FLUSH + * state. So an optimization is possible where we block new fast + * commits here and wait for existing ones to complete + * just before we enter T_FLUSH. That way, the existing fast + * commits and this full commit can proceed parallely. + */ + } + write_unlock(&journal->j_state_lock); + + commit_transaction = journal->j_running_transaction; + + trace_jbd2_start_commit(journal, commit_transaction); + jbd2_debug(1, "JBD2: starting commit of transaction %d\n", + commit_transaction->t_tid); + + write_lock(&journal->j_state_lock); + journal->j_fc_off = 0; + J_ASSERT(commit_transaction->t_state == T_RUNNING); + commit_transaction->t_state = T_LOCKED; + + trace_jbd2_commit_locking(journal, commit_transaction); + stats.run.rs_wait = commit_transaction->t_max_wait; + stats.run.rs_request_delay = 0; + stats.run.rs_locked = jiffies; + if (commit_transaction->t_requested) + stats.run.rs_request_delay = + jbd2_time_diff(commit_transaction->t_requested, + stats.run.rs_locked); + stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start, + stats.run.rs_locked); + + // waits for any t_updates to finish + jbd2_journal_wait_updates(journal); + + commit_transaction->t_state = T_SWITCH; + + J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= + journal->j_max_transaction_buffers); + + /* + * First thing we are allowed to do is to discard any remaining + * BJ_Reserved buffers. Note, it is _not_ permissible to assume + * that there are no such buffers: if a large filesystem + * operation like a truncate needs to split itself over multiple + * transactions, then it may try to do a jbd2_journal_restart() while + * there are still BJ_Reserved buffers outstanding. These must + * be released cleanly from the current transaction. + * + * In this case, the filesystem must still reserve write access + * again before modifying the buffer in the new transaction, but + * we do not require it to remember exactly which old buffers it + * has reserved. This is consistent with the existing behaviour + * that multiple jbd2_journal_get_write_access() calls to the same + * buffer are perfectly permissible. + * We use journal->j_state_lock here to serialize processing of + * t_reserved_list with eviction of buffers from journal_unmap_buffer(). 
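Before following the buffer-list walk below, it helps to keep the transaction state machine in view. A hedged summary of the states this function drives a transaction through; the enum itself lives in include/linux/jbd2.h and the one-line glosses here are approximate:

/*
 * T_RUNNING           accepting new handles
 * T_LOCKED            commit requested, draining t_updates
 * T_SWITCH            updates drained, about to become the committing one
 * T_FLUSH             committing; data writeout, a new transaction may start
 * T_COMMIT            metadata blocks being written to the log
 * T_COMMIT_DFLUSH     data writeout finished, fs device flushed if needed
 * T_COMMIT_JFLUSH     journal IO drained, commit record written
 * T_COMMIT_CALLBACK   running j_commit_callback
 * T_FINISHED          done; eligible for checkpointing and freeing
 */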
+ */ + while (commit_transaction->t_reserved_list) { + jh = commit_transaction->t_reserved_list; + JBUFFER_TRACE(jh, "reserved, unused: refile"); + /* + * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may + * leave undo-committed data. + */ + if (jh->b_committed_data) { + struct buffer_head *bh = jh2bh(jh); + + spin_lock(&jh->b_state_lock); + jbd2_free(jh->b_committed_data, bh->b_size); + jh->b_committed_data = NULL; + spin_unlock(&jh->b_state_lock); + } + jbd2_journal_refile_buffer(journal, jh); + } + + write_unlock(&journal->j_state_lock); + /* + * Now try to drop any written-back buffers from the journal's + * checkpoint lists. We do this *before* commit because it potentially + * frees some memory + */ + spin_lock(&journal->j_list_lock); + __jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_BUSY_STOP); + spin_unlock(&journal->j_list_lock); + + jbd2_debug(3, "JBD2: commit phase 1\n"); + + /* + * Clear revoked flag to reflect there is no revoked buffers + * in the next transaction which is going to be started. + */ + jbd2_clear_buffer_revoked_flags(journal); + + /* + * Switch to a new revoke table. + */ + jbd2_journal_switch_revoke_table(journal); + + write_lock(&journal->j_state_lock); + /* + * Reserved credits cannot be claimed anymore, free them + */ + atomic_sub(atomic_read(&journal->j_reserved_credits), + &commit_transaction->t_outstanding_credits); + + trace_jbd2_commit_flushing(journal, commit_transaction); + stats.run.rs_flushing = jiffies; + stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, + stats.run.rs_flushing); + + commit_transaction->t_state = T_FLUSH; + journal->j_committing_transaction = commit_transaction; + journal->j_running_transaction = NULL; + start_time = ktime_get(); + commit_transaction->t_log_start = journal->j_head; + wake_up_all(&journal->j_wait_transaction_locked); + write_unlock(&journal->j_state_lock); + + jbd2_debug(3, "JBD2: commit phase 2a\n"); + + /* + * Now start flushing things to disk, in the order they appear + * on the transaction lists. Data blocks go first. + */ + err = journal_submit_data_buffers(journal, commit_transaction); + if (err) + jbd2_journal_abort(journal, err); + + blk_start_plug(&plug); + jbd2_journal_write_revoke_records(commit_transaction, &log_bufs); + + jbd2_debug(3, "JBD2: commit phase 2b\n"); + + /* + * Way to go: we have now written out all of the data for a + * transaction! Now comes the tricky part: we need to write out + * metadata. Loop over the transaction's entire buffer list: + */ + write_lock(&journal->j_state_lock); + commit_transaction->t_state = T_COMMIT; + write_unlock(&journal->j_state_lock); + + trace_jbd2_commit_logging(journal, commit_transaction); + stats.run.rs_logging = jiffies; + stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, + stats.run.rs_logging); + stats.run.rs_blocks = commit_transaction->t_nr_buffers; + stats.run.rs_blocks_logged = 0; + + J_ASSERT(commit_transaction->t_nr_buffers <= + atomic_read(&commit_transaction->t_outstanding_credits)); + + bufs = 0; + descriptor = NULL; + while (commit_transaction->t_buffers) { + + /* Find the next buffer to be journaled... */ + + jh = commit_transaction->t_buffers; + + /* If we're in abort mode, we just un-journal the buffer and + release it. */ + + if (is_journal_aborted(journal)) { + clear_buffer_jbddirty(jh2bh(jh)); + JBUFFER_TRACE(jh, "journal is aborting: refile"); + jbd2_buffer_abort_trigger(jh, + jh->b_frozen_data ? 
+ jh->b_frozen_triggers : + jh->b_triggers); + jbd2_journal_refile_buffer(journal, jh); + /* If that was the last one, we need to clean up + * any descriptor buffers which may have been + * already allocated, even if we are now + * aborting. */ + if (!commit_transaction->t_buffers) + goto start_journal_io; + continue; + } + + /* Make sure we have a descriptor block in which to + record the metadata buffer. */ + + if (!descriptor) { + J_ASSERT (bufs == 0); + + jbd2_debug(4, "JBD2: get descriptor\n"); + + descriptor = jbd2_journal_get_descriptor_buffer( + commit_transaction, + JBD2_DESCRIPTOR_BLOCK); + if (!descriptor) { + jbd2_journal_abort(journal, -EIO); + continue; + } + + jbd2_debug(4, "JBD2: got buffer %llu (%p)\n", + (unsigned long long)descriptor->b_blocknr, + descriptor->b_data); + tagp = &descriptor->b_data[sizeof(journal_header_t)]; + space_left = descriptor->b_size - + sizeof(journal_header_t); + first_tag = 1; + set_buffer_jwrite(descriptor); + set_buffer_dirty(descriptor); + wbuf[bufs++] = descriptor; + + /* Record it so that we can wait for IO + completion later */ + BUFFER_TRACE(descriptor, "ph3: file as descriptor"); + jbd2_file_log_bh(&log_bufs, descriptor); + } + + /* Where is the buffer to be written? */ + + err = jbd2_journal_next_log_block(journal, &blocknr); + /* If the block mapping failed, just abandon the buffer + and repeat this loop: we'll fall into the + refile-on-abort condition above. */ + if (err) { + jbd2_journal_abort(journal, err); + continue; + } + + /* + * start_this_handle() uses t_outstanding_credits to determine + * the free space in the log. + */ + atomic_dec(&commit_transaction->t_outstanding_credits); + + /* Bump b_count to prevent truncate from stumbling over + the shadowed buffer! @@@ This can go if we ever get + rid of the shadow pairing of buffers. */ + atomic_inc(&jh2bh(jh)->b_count); + + /* + * Make a temporary IO buffer with which to write it out + * (this will requeue the metadata buffer to BJ_Shadow). + */ + set_bit(BH_JWrite, &jh2bh(jh)->b_state); + JBUFFER_TRACE(jh, "ph3: write metadata"); + escape = jbd2_journal_write_metadata_buffer(commit_transaction, + jh, &wbuf[bufs], blocknr); + jbd2_file_log_bh(&io_bufs, wbuf[bufs]); + + /* Record the new block's tag in the current descriptor + buffer */ + + tag_flag = 0; + if (escape) + tag_flag |= JBD2_FLAG_ESCAPE; + if (!first_tag) + tag_flag |= JBD2_FLAG_SAME_UUID; + + tag = (journal_block_tag_t *) tagp; + write_tag_block(journal, tag, jh2bh(jh)->b_blocknr); + tag->t_flags = cpu_to_be16(tag_flag); + jbd2_block_tag_csum_set(journal, tag, wbuf[bufs], + commit_transaction->t_tid); + tagp += tag_bytes; + space_left -= tag_bytes; + bufs++; + + if (first_tag) { + memcpy (tagp, journal->j_uuid, 16); + tagp += 16; + space_left -= 16; + first_tag = 0; + } + + /* If there's no more to do, or if the descriptor is full, + let the IO rip! */ + + if (bufs == journal->j_wbufsize || + commit_transaction->t_buffers == NULL || + space_left < tag_bytes + 16 + csum_size) { + + jbd2_debug(4, "JBD2: Submit %d IOs\n", bufs); + + /* Write an end-of-descriptor marker before + submitting the IOs. "tag" still points to + the last tag we set up. */ + + tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); +start_journal_io: + if (descriptor) + jbd2_descriptor_block_csum_set(journal, + descriptor); + + for (i = 0; i < bufs; i++) { + struct buffer_head *bh = wbuf[i]; + + /* + * Compute checksum. 
+ */ + if (jbd2_has_feature_checksum(journal)) { + crc32_sum = + jbd2_checksum_data(crc32_sum, bh); + } + + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + bh->b_end_io = journal_end_buffer_io_sync; + submit_bh(REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS, + bh); + } + cond_resched(); + + /* Force a new descriptor to be generated next + time round the loop. */ + descriptor = NULL; + bufs = 0; + } + } + + err = journal_finish_inode_data_buffers(journal, commit_transaction); + if (err) { + printk(KERN_WARNING + "JBD2: Detected IO errors %d while flushing file data on %s\n", + err, journal->j_devname); + err = 0; + } + + /* + * Get current oldest transaction in the log before we issue flush + * to the filesystem device. After the flush we can be sure that + * blocks of all older transactions are checkpointed to persistent + * storage and we will be safe to update journal start in the + * superblock with the numbers we get here. + */ + update_tail = + jbd2_journal_get_log_tail(journal, &first_tid, &first_block); + + write_lock(&journal->j_state_lock); + if (update_tail) { + long freed = first_block - journal->j_tail; + + if (first_block < journal->j_tail) + freed += journal->j_last - journal->j_first; + /* Update tail only if we free significant amount of space */ + if (freed < journal->j_max_transaction_buffers) + update_tail = 0; + } + J_ASSERT(commit_transaction->t_state == T_COMMIT); + commit_transaction->t_state = T_COMMIT_DFLUSH; + write_unlock(&journal->j_state_lock); + + /* + * If the journal is not located on the file system device, + * then we must flush the file system device before we issue + * the commit record and update the journal tail sequence. + */ + if ((commit_transaction->t_need_data_flush || update_tail) && + (journal->j_fs_dev != journal->j_dev) && + (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(journal->j_fs_dev); + + /* Done it all: now write the commit record asynchronously. */ + if (jbd2_has_feature_async_commit(journal)) { + err = journal_submit_commit_record(journal, commit_transaction, + &cbh, crc32_sum); + if (err) + jbd2_journal_abort(journal, err); + } + + blk_finish_plug(&plug); + + /* Lo and behold: we have just managed to send a transaction to + the log. Before we can commit it, wait for the IO so far to + complete. Control buffers being written are on the + transaction's t_log_list queue, and metadata buffers are on + the io_bufs list. + + Wait for the buffers in reverse order. That way we are + less likely to be woken up until all IOs have completed, and + so we incur less scheduling load. + */ + + jbd2_debug(3, "JBD2: commit phase 3\n"); + + while (!list_empty(&io_bufs)) { + struct buffer_head *bh = list_entry(io_bufs.prev, + struct buffer_head, + b_assoc_buffers); + + wait_on_buffer(bh); + cond_resched(); + + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + jbd2_unfile_log_bh(bh); + stats.run.rs_blocks_logged++; + + /* + * The list contains temporary buffer heads created by + * jbd2_journal_write_metadata_buffer(). 
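A note on the JBD2_FLAG_ESCAPE tags set in the loop above: a block whose first four bytes happen to equal JBD2_MAGIC_NUMBER would look like a journal header to the recovery scan, so jbd2_journal_write_metadata_buffer() zeroes those bytes in the journal copy and the tag remembers that it did. A hedged illustration of both directions; these helpers are illustrative, not the kernel's actual ones:

#include <linux/jbd2.h>

/* Write side: sanitise the copy that goes into the log */
static bool example_escape(__be32 *journal_copy)
{
	if (*journal_copy != cpu_to_be32(JBD2_MAGIC_NUMBER))
		return false;
	*journal_copy = 0;	/* must not look like a journal header */
	return true;		/* caller sets JBD2_FLAG_ESCAPE in the tag */
}

/* Replay side: restore the magic, as jbd2_do_replay() does in recovery.c */
static void example_unescape(__be32 *replayed_block, u16 tag_flags)
{
	if (tag_flags & JBD2_FLAG_ESCAPE)
		*replayed_block = cpu_to_be32(JBD2_MAGIC_NUMBER);
}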
+ */ + BUFFER_TRACE(bh, "dumping temporary bh"); + __brelse(bh); + J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); + free_buffer_head(bh); + + /* We also have to refile the corresponding shadowed buffer */ + jh = commit_transaction->t_shadow_list->b_tprev; + bh = jh2bh(jh); + clear_buffer_jwrite(bh); + J_ASSERT_BH(bh, buffer_jbddirty(bh)); + J_ASSERT_BH(bh, !buffer_shadow(bh)); + + /* The metadata is now released for reuse, but we need + to remember it against this transaction so that when + we finally commit, we can do any checkpointing + required. */ + JBUFFER_TRACE(jh, "file as BJ_Forget"); + jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); + JBUFFER_TRACE(jh, "brelse shadowed buffer"); + __brelse(bh); + } + + J_ASSERT (commit_transaction->t_shadow_list == NULL); + + jbd2_debug(3, "JBD2: commit phase 4\n"); + + /* Here we wait for the revoke record and descriptor record buffers */ + while (!list_empty(&log_bufs)) { + struct buffer_head *bh; + + bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers); + wait_on_buffer(bh); + cond_resched(); + + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + + BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); + clear_buffer_jwrite(bh); + jbd2_unfile_log_bh(bh); + stats.run.rs_blocks_logged++; + __brelse(bh); /* One for getblk */ + /* AKPM: bforget here */ + } + + if (err) + jbd2_journal_abort(journal, err); + + jbd2_debug(3, "JBD2: commit phase 5\n"); + write_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); + commit_transaction->t_state = T_COMMIT_JFLUSH; + write_unlock(&journal->j_state_lock); + + if (!jbd2_has_feature_async_commit(journal)) { + err = journal_submit_commit_record(journal, commit_transaction, + &cbh, crc32_sum); + if (err) + jbd2_journal_abort(journal, err); + } + if (cbh) + err = journal_wait_on_commit_record(journal, cbh); + stats.run.rs_blocks_logged++; + if (jbd2_has_feature_async_commit(journal) && + journal->j_flags & JBD2_BARRIER) { + blkdev_issue_flush(journal->j_dev); + } + + if (err) + jbd2_journal_abort(journal, err); + + WARN_ON_ONCE( + atomic_read(&commit_transaction->t_outstanding_credits) < 0); + + /* + * Now disk caches for filesystem device are flushed so we are safe to + * erase checkpointed transactions from the log by updating journal + * superblock. + */ + if (update_tail) + jbd2_update_log_tail(journal, first_tid, first_block); + + /* End of a transaction! Finally, we can do checkpoint + processing: any buffers committed as a result of this + transaction can be removed from any checkpoint list it was on + before. */ + + jbd2_debug(3, "JBD2: commit phase 6\n"); + + J_ASSERT(list_empty(&commit_transaction->t_inode_list)); + J_ASSERT(commit_transaction->t_buffers == NULL); + J_ASSERT(commit_transaction->t_checkpoint_list == NULL); + J_ASSERT(commit_transaction->t_shadow_list == NULL); + +restart_loop: + /* + * As there are other places (journal_unmap_buffer()) adding buffers + * to this list we have to be careful and hold the j_list_lock. + */ + spin_lock(&journal->j_list_lock); + while (commit_transaction->t_forget) { + transaction_t *cp_transaction; + struct buffer_head *bh; + int try_to_free = 0; + bool drop_ref; + + jh = commit_transaction->t_forget; + spin_unlock(&journal->j_list_lock); + bh = jh2bh(jh); + /* + * Get a reference so that bh cannot be freed before we are + * done with it. 
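The b_committed_data rotation handled just below exists for undo access: when a filesystem clears bits in an allocation bitmap it must not hand those blocks out again until the transaction commits, so jbd2 keeps a copy of the bitmap as of the last commit. A hedged sketch of the caller pattern that creates b_committed_data (ext4 uses this for its block and inode bitmaps):

/* Modify an allocation bitmap so that the change remains undoable */
static int my_modify_bitmap(handle_t *handle, struct buffer_head *bitmap_bh)
{
	int err;

	/* Snapshots the committed contents into jh->b_committed_data */
	err = jbd2_journal_get_undo_access(handle, bitmap_bh);
	if (err)
		return err;

	/* ... set or clear bits in bitmap_bh->b_data ... */

	return jbd2_journal_dirty_metadata(handle, bitmap_bh);
}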
+ */ + get_bh(bh); + spin_lock(&jh->b_state_lock); + J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); + + /* + * If there is undo-protected committed data against + * this buffer, then we can remove it now. If it is a + * buffer needing such protection, the old frozen_data + * field now points to a committed version of the + * buffer, so rotate that field to the new committed + * data. + * + * Otherwise, we can just throw away the frozen data now. + * + * We also know that the frozen data has already fired + * its triggers if they exist, so we can clear that too. + */ + if (jh->b_committed_data) { + jbd2_free(jh->b_committed_data, bh->b_size); + jh->b_committed_data = NULL; + if (jh->b_frozen_data) { + jh->b_committed_data = jh->b_frozen_data; + jh->b_frozen_data = NULL; + jh->b_frozen_triggers = NULL; + } + } else if (jh->b_frozen_data) { + jbd2_free(jh->b_frozen_data, bh->b_size); + jh->b_frozen_data = NULL; + jh->b_frozen_triggers = NULL; + } + + spin_lock(&journal->j_list_lock); + cp_transaction = jh->b_cp_transaction; + if (cp_transaction) { + JBUFFER_TRACE(jh, "remove from old cp transaction"); + cp_transaction->t_chp_stats.cs_dropped++; + __jbd2_journal_remove_checkpoint(jh); + } + + /* Only re-checkpoint the buffer_head if it is marked + * dirty. If the buffer was added to the BJ_Forget list + * by jbd2_journal_forget, it may no longer be dirty and + * there's no point in keeping a checkpoint record for + * it. */ + + /* + * A buffer which has been freed while still being journaled + * by a previous transaction, refile the buffer to BJ_Forget of + * the running transaction. If the just committed transaction + * contains "add to orphan" operation, we can completely + * invalidate the buffer now. We are rather through in that + * since the buffer may be still accessible when blocksize < + * pagesize and it is attached to the last partial page. + */ + if (buffer_freed(bh) && !jh->b_next_transaction) { + struct address_space *mapping; + + clear_buffer_freed(bh); + clear_buffer_jbddirty(bh); + + /* + * Block device buffers need to stay mapped all the + * time, so it is enough to clear buffer_jbddirty and + * buffer_freed bits. For the file mapping buffers (i.e. + * journalled data) we need to unmap buffer and clear + * more bits. We also need to be careful about the check + * because the data page mapping can get cleared under + * our hands. Note that if mapping == NULL, we don't + * need to make buffer unmapped because the page is + * already detached from the mapping and buffers cannot + * get reused. + */ + mapping = READ_ONCE(bh->b_folio->mapping); + if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) { + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_req(bh); + bh->b_bdev = NULL; + } + } + + if (buffer_jbddirty(bh)) { + JBUFFER_TRACE(jh, "add to new checkpointing trans"); + __jbd2_journal_insert_checkpoint(jh, commit_transaction); + if (is_journal_aborted(journal)) + clear_buffer_jbddirty(bh); + } else { + J_ASSERT_BH(bh, !buffer_dirty(bh)); + /* + * The buffer on BJ_Forget list and not jbddirty means + * it has been freed by this transaction and hence it + * could not have been reallocated until this + * transaction has committed. *BUT* it could be + * reallocated once we have written all the data to + * disk and before we process the buffer on BJ_Forget + * list. 
+ */ + if (!jh->b_next_transaction) + try_to_free = 1; + } + JBUFFER_TRACE(jh, "refile or unfile buffer"); + drop_ref = __jbd2_journal_refile_buffer(jh); + spin_unlock(&jh->b_state_lock); + if (drop_ref) + jbd2_journal_put_journal_head(jh); + if (try_to_free) + release_buffer_page(bh); /* Drops bh reference */ + else + __brelse(bh); + cond_resched_lock(&journal->j_list_lock); + } + spin_unlock(&journal->j_list_lock); + /* + * This is a bit sleazy. We use j_list_lock to protect transition + * of a transaction into T_FINISHED state and calling + * __jbd2_journal_drop_transaction(). Otherwise we could race with + * other checkpointing code processing the transaction... + */ + write_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + /* + * Now recheck if some buffers did not get attached to the transaction + * while the lock was dropped... + */ + if (commit_transaction->t_forget) { + spin_unlock(&journal->j_list_lock); + write_unlock(&journal->j_state_lock); + goto restart_loop; + } + + /* Add the transaction to the checkpoint list + * __journal_remove_checkpoint() can not destroy transaction + * under us because it is not marked as T_FINISHED yet */ + if (journal->j_checkpoint_transactions == NULL) { + journal->j_checkpoint_transactions = commit_transaction; + commit_transaction->t_cpnext = commit_transaction; + commit_transaction->t_cpprev = commit_transaction; + } else { + commit_transaction->t_cpnext = + journal->j_checkpoint_transactions; + commit_transaction->t_cpprev = + commit_transaction->t_cpnext->t_cpprev; + commit_transaction->t_cpnext->t_cpprev = + commit_transaction; + commit_transaction->t_cpprev->t_cpnext = + commit_transaction; + } + spin_unlock(&journal->j_list_lock); + + /* Done with this transaction! */ + + jbd2_debug(3, "JBD2: commit phase 7\n"); + + J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); + + commit_transaction->t_start = jiffies; + stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, + commit_transaction->t_start); + + /* + * File the transaction statistics + */ + stats.ts_tid = commit_transaction->t_tid; + stats.run.rs_handle_count = + atomic_read(&commit_transaction->t_handle_count); + trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, + commit_transaction->t_tid, &stats.run); + stats.ts_requested = (commit_transaction->t_requested) ? 
1 : 0; + + commit_transaction->t_state = T_COMMIT_CALLBACK; + J_ASSERT(commit_transaction == journal->j_committing_transaction); + WRITE_ONCE(journal->j_commit_sequence, commit_transaction->t_tid); + journal->j_committing_transaction = NULL; + commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + + /* + * weight the commit time higher than the average time so we don't + * react too strongly to vast changes in the commit time + */ + if (likely(journal->j_average_commit_time)) + journal->j_average_commit_time = (commit_time + + journal->j_average_commit_time*3) / 4; + else + journal->j_average_commit_time = commit_time; + + write_unlock(&journal->j_state_lock); + + if (journal->j_commit_callback) + journal->j_commit_callback(journal, commit_transaction); + if (journal->j_fc_cleanup_callback) + journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid); + + trace_jbd2_end_commit(journal, commit_transaction); + jbd2_debug(1, "JBD2: commit %d complete, head %d\n", + journal->j_commit_sequence, journal->j_tail_sequence); + + write_lock(&journal->j_state_lock); + journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING; + journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; + spin_lock(&journal->j_list_lock); + commit_transaction->t_state = T_FINISHED; + /* Check if the transaction can be dropped now that we are finished */ + if (commit_transaction->t_checkpoint_list == NULL) { + __jbd2_journal_drop_transaction(journal, commit_transaction); + jbd2_journal_free_transaction(commit_transaction); + } + spin_unlock(&journal->j_list_lock); + write_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_done_commit); + wake_up(&journal->j_fc_wait); + + /* + * Calculate overall stats + */ + spin_lock(&journal->j_history_lock); + journal->j_stats.ts_tid++; + journal->j_stats.ts_requested += stats.ts_requested; + journal->j_stats.run.rs_wait += stats.run.rs_wait; + journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay; + journal->j_stats.run.rs_running += stats.run.rs_running; + journal->j_stats.run.rs_locked += stats.run.rs_locked; + journal->j_stats.run.rs_flushing += stats.run.rs_flushing; + journal->j_stats.run.rs_logging += stats.run.rs_logging; + journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; + journal->j_stats.run.rs_blocks += stats.run.rs_blocks; + journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; + spin_unlock(&journal->j_history_lock); +} diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c new file mode 100644 index 00000000000..cac8c2cd4a9 --- /dev/null +++ b/fs/jbd2/recovery.c @@ -0,0 +1,996 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * linux/fs/jbd2/recovery.c + * + * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 + * + * Copyright 1999-2000 Red Hat Software --- All Rights Reserved + * + * Journal recovery routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + */ + +#ifndef __KERNEL__ +#include "jfs_user.h" +#else +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/jbd2.h> +#include <linux/errno.h> +#include <linux/crc32.h> +#include <linux/blkdev.h> +#include <linux/string_choices.h> +#endif + +/* + * Maintain information about the progress of the recovery job, so that + * the different passes can carry information between them. 
+ */ +struct recovery_info +{ + tid_t start_transaction; + tid_t end_transaction; + unsigned long head_block; + + int nr_replays; + int nr_revokes; + int nr_revoke_hits; +}; + +static int do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass); +static int scan_revoke_records(journal_t *, enum passtype, struct buffer_head *, + tid_t, struct recovery_info *); + +#ifdef __KERNEL__ + +/* Release readahead buffers after use */ +static void journal_brelse_array(struct buffer_head *b[], int n) +{ + while (--n >= 0) + brelse (b[n]); +} + + +/* + * When reading from the journal, we are going through the block device + * layer directly and so there is no readahead being done for us. We + * need to implement any readahead ourselves if we want it to happen at + * all. Recovery is basically one long sequential read, so make sure we + * do the IO in reasonably large chunks. + * + * This is not so critical that we need to be enormously clever about + * the readahead size, though. 128K is a purely arbitrary, good-enough + * fixed value. + */ + +#define MAXBUF 8 +static void do_readahead(journal_t *journal, unsigned int start) +{ + unsigned int max, nbufs, next; + unsigned long long blocknr; + struct buffer_head *bh; + + struct buffer_head * bufs[MAXBUF]; + + /* Do up to 128K of readahead */ + max = start + (128 * 1024 / journal->j_blocksize); + if (max > journal->j_total_len) + max = journal->j_total_len; + + /* Do the readahead itself. We'll submit MAXBUF buffer_heads at + * a time to the block device IO layer. */ + + nbufs = 0; + + for (next = start; next < max; next++) { + int err = jbd2_journal_bmap(journal, next, &blocknr); + + if (err) { + printk(KERN_ERR "JBD2: bad block at offset %u\n", + next); + goto failed; + } + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + goto failed; + + if (!buffer_uptodate(bh) && !buffer_locked(bh)) { + bufs[nbufs++] = bh; + if (nbufs == MAXBUF) { + bh_readahead_batch(nbufs, bufs, 0); + journal_brelse_array(bufs, nbufs); + nbufs = 0; + } + } else + brelse(bh); + } + + if (nbufs) + bh_readahead_batch(nbufs, bufs, 0); + +failed: + if (nbufs) + journal_brelse_array(bufs, nbufs); +} + +#endif /* __KERNEL__ */ + + +/* + * Read a block from the journal + */ + +static int jread(struct buffer_head **bhp, journal_t *journal, + unsigned int offset) +{ + int err; + unsigned long long blocknr; + struct buffer_head *bh; + + *bhp = NULL; + + if (offset >= journal->j_total_len) { + printk(KERN_ERR "JBD2: corrupted journal superblock\n"); + return -EFSCORRUPTED; + } + + err = jbd2_journal_bmap(journal, offset, &blocknr); + + if (err) { + printk(KERN_ERR "JBD2: bad block at offset %u\n", + offset); + return err; + } + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return -ENOMEM; + + if (!buffer_uptodate(bh)) { + /* + * If this is a brand new buffer, start readahead. + * Otherwise, we assume we are already reading it. 
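As a worked example of the sizes involved: with a 4KiB journal block size the 128K window in do_readahead() covers 128 * 1024 / 4096 = 32 blocks (clamped to j_total_len), submitted in MAXBUF = 8 buffer_head batches, so filling one window costs at most four bh_readahead_batch() calls.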
+ */ + bool need_readahead = !buffer_req(bh); + + bh_read_nowait(bh, 0); + if (need_readahead) + do_readahead(journal, offset); + wait_on_buffer(bh); + } + + if (!buffer_uptodate(bh)) { + printk(KERN_ERR "JBD2: Failed to read block at offset %u\n", + offset); + brelse(bh); + return -EIO; + } + + *bhp = bh; + return 0; +} + +static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf) +{ + struct jbd2_journal_block_tail *tail; + __be32 provided; + __u32 calculated; + + if (!jbd2_journal_has_csum_v2or3(j)) + return 1; + + tail = (struct jbd2_journal_block_tail *)((char *)buf + + j->j_blocksize - sizeof(struct jbd2_journal_block_tail)); + provided = tail->t_checksum; + tail->t_checksum = 0; + calculated = jbd2_chksum(j->j_csum_seed, buf, j->j_blocksize); + tail->t_checksum = provided; + + return provided == cpu_to_be32(calculated); +} + +/* + * Count the number of in-use tags in a journal descriptor block. + */ + +static int count_tags(journal_t *journal, struct buffer_head *bh) +{ + char * tagp; + journal_block_tag_t tag; + int nr = 0, size = journal->j_blocksize; + int tag_bytes = journal_tag_bytes(journal); + + if (jbd2_journal_has_csum_v2or3(journal)) + size -= sizeof(struct jbd2_journal_block_tail); + + tagp = &bh->b_data[sizeof(journal_header_t)]; + + while ((tagp - bh->b_data + tag_bytes) <= size) { + memcpy(&tag, tagp, sizeof(tag)); + + nr++; + tagp += tag_bytes; + if (!(tag.t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID))) + tagp += 16; + + if (tag.t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG)) + break; + } + + return nr; +} + + +/* Make sure we wrap around the log correctly! */ +#define wrap(journal, var) \ +do { \ + if (var >= (journal)->j_last) \ + var -= ((journal)->j_last - (journal)->j_first); \ +} while (0) + +static int fc_do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass) +{ + unsigned int expected_commit_id = info->end_transaction; + unsigned long next_fc_block; + struct buffer_head *bh; + int err = 0; + + next_fc_block = journal->j_fc_first; + if (!journal->j_fc_replay_callback) + return 0; + + while (next_fc_block <= journal->j_fc_last) { + jbd2_debug(3, "Fast commit replay: next block %ld\n", + next_fc_block); + err = jread(&bh, journal, next_fc_block); + if (err) { + jbd2_debug(3, "Fast commit replay: read error\n"); + break; + } + + err = journal->j_fc_replay_callback(journal, bh, pass, + next_fc_block - journal->j_fc_first, + expected_commit_id); + brelse(bh); + next_fc_block++; + if (err < 0 || err == JBD2_FC_REPLAY_STOP) + break; + err = 0; + } + + if (err) + jbd2_debug(3, "Fast commit replay failed, err = %d\n", err); + + return err; +} + +/** + * jbd2_journal_recover - recovers a on-disk journal + * @journal: the journal to recover + * + * The primary function for recovering the log contents when mounting a + * journaled device. + * + * Recovery is done in three passes. In the first pass, we look for the + * end of the log. In the second, we assemble the list of revoke + * blocks. In the third and final pass, we replay any un-revoked blocks + * in the log. + */ +int jbd2_journal_recover(journal_t *journal) +{ + int err, err2; + struct recovery_info info; + + memset(&info, 0, sizeof(info)); + + /* + * The journal superblock's s_start field (the current log head) + * is always zero if, and only if, the journal was cleanly + * unmounted. We use its in-memory version j_tail here because + * jbd2_journal_wipe() could have updated it without updating journal + * superblock. 
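For context, jbd2_journal_recover() is normally reached through jbd2_journal_load() at mount time rather than called directly. A minimal sketch of the mount side, assuming an already-opened journal inode and with error reporting trimmed (ext4 runs the same sequence from its mount path):

#include <linux/jbd2.h>

static journal_t *my_load_journal(struct inode *journal_inode)
{
	journal_t *journal;

	journal = jbd2_journal_init_inode(journal_inode);
	if (!journal)
		return NULL;

	/*
	 * Reads the journal superblock and, if s_start is non-zero
	 * (unclean unmount), runs the three-pass recovery below before
	 * the journal goes live.
	 */
	if (jbd2_journal_load(journal)) {
		jbd2_journal_destroy(journal);
		return NULL;
	}
	return journal;
}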
+ */ + if (!journal->j_tail) { + journal_superblock_t *sb = journal->j_superblock; + + jbd2_debug(1, "No recovery required, last transaction %d, head block %u\n", + be32_to_cpu(sb->s_sequence), be32_to_cpu(sb->s_head)); + journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; + journal->j_head = be32_to_cpu(sb->s_head); + return 0; + } + + err = do_one_pass(journal, &info, PASS_SCAN); + if (!err) + err = do_one_pass(journal, &info, PASS_REVOKE); + if (!err) + err = do_one_pass(journal, &info, PASS_REPLAY); + + jbd2_debug(1, "JBD2: recovery, exit status %d, " + "recovered transactions %u to %u\n", + err, info.start_transaction, info.end_transaction); + jbd2_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n", + info.nr_replays, info.nr_revoke_hits, info.nr_revokes); + + /* Restart the log at the next transaction ID, thus invalidating + * any existing commit records in the log. */ + journal->j_transaction_sequence = ++info.end_transaction; + journal->j_head = info.head_block; + jbd2_debug(1, "JBD2: last transaction %d, head block %lu\n", + journal->j_transaction_sequence, journal->j_head); + + jbd2_journal_clear_revoke(journal); + /* Free revoke table allocated for replay */ + if (journal->j_revoke != journal->j_revoke_table[0] && + journal->j_revoke != journal->j_revoke_table[1]) { + jbd2_journal_destroy_revoke_table(journal->j_revoke); + journal->j_revoke = journal->j_revoke_table[1]; + } + err2 = sync_blockdev(journal->j_fs_dev); + if (!err) + err = err2; + err2 = jbd2_check_fs_dev_write_error(journal); + if (!err) + err = err2; + /* Make sure all replayed data is on permanent storage */ + if (journal->j_flags & JBD2_BARRIER) { + err2 = blkdev_issue_flush(journal->j_fs_dev); + if (!err) + err = err2; + } + return err; +} + +/** + * jbd2_journal_skip_recovery - Start journal and wipe exiting records + * @journal: journal to startup + * + * Locate any valid recovery information from the journal and set up the + * journal structures in memory to ignore it (presumably because the + * caller has evidence that it is out of date). + * This function doesn't appear to be exported.. + * + * We perform one pass over the journal to allow us to tell the user how + * much recovery information is being erased, and to let us initialise + * the journal transaction sequence numbers to the next unused ID. + */ +int jbd2_journal_skip_recovery(journal_t *journal) +{ + int err; + + struct recovery_info info; + + memset (&info, 0, sizeof(info)); + + err = do_one_pass(journal, &info, PASS_SCAN); + + if (err) { + printk(KERN_ERR "JBD2: error %d scanning journal\n", err); + ++journal->j_transaction_sequence; + journal->j_head = journal->j_first; + } else { +#ifdef CONFIG_JBD2_DEBUG + int dropped = info.end_transaction - + be32_to_cpu(journal->j_superblock->s_sequence); + jbd2_debug(1, + "JBD2: ignoring %d transaction%s from the journal.\n", + dropped, str_plural(dropped)); +#endif + journal->j_transaction_sequence = ++info.end_transaction; + journal->j_head = info.head_block; + } + + journal->j_tail = 0; + return err; +} + +static inline unsigned long long read_tag_block(journal_t *journal, + journal_block_tag_t *tag) +{ + unsigned long long block = be32_to_cpu(tag->t_blocknr); + if (jbd2_has_feature_64bit(journal)) + block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32; + return block; +} + +/* + * calc_chksums calculates the checksums for the blocks described in the + * descriptor block. 
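read_tag_block() above is the inverse of write_tag_block() in commit.c. A hedged round-trip illustration: the low 32 bits of the block number live in t_blocknr and the high bits in t_blocknr_high when the 64bit feature is set, and the writer's "(block >> 31) >> 1" is equivalent to ">> 32" for an unsigned 64-bit value, apparently kept in that split form so the shift stays well-defined even if applied to a 32-bit quantity:

#include <linux/bug.h>
#include <linux/jbd2.h>

/* Illustrative only; the real encode/decode are the two functions named above */
static void example_tag_roundtrip(journal_block_tag_t *tag,
				  unsigned long long block)
{
	unsigned long long decoded;

	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);

	decoded = be32_to_cpu(tag->t_blocknr) |
		  ((u64)be32_to_cpu(tag->t_blocknr_high) << 32);
	WARN_ON(decoded != block);	/* holds for any 64-bit block number */
}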
+ */ +static int calc_chksums(journal_t *journal, struct buffer_head *bh, + unsigned long *next_log_block, __u32 *crc32_sum) +{ + int i, num_blks, err; + unsigned long io_block; + struct buffer_head *obh; + + num_blks = count_tags(journal, bh); + /* Calculate checksum of the descriptor block. */ + *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size); + + for (i = 0; i < num_blks; i++) { + io_block = (*next_log_block)++; + wrap(journal, *next_log_block); + err = jread(&obh, journal, io_block); + if (err) { + printk(KERN_ERR "JBD2: IO error %d recovering block " + "%lu in log\n", err, io_block); + return 1; + } else { + *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data, + obh->b_size); + } + put_bh(obh); + } + return 0; +} + +static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) +{ + struct commit_header *h; + __be32 provided; + __u32 calculated; + + if (!jbd2_journal_has_csum_v2or3(j)) + return 1; + + h = buf; + provided = h->h_chksum[0]; + h->h_chksum[0] = 0; + calculated = jbd2_chksum(j->j_csum_seed, buf, j->j_blocksize); + h->h_chksum[0] = provided; + + return provided == cpu_to_be32(calculated); +} + +static bool jbd2_commit_block_csum_verify_partial(journal_t *j, void *buf) +{ + struct commit_header *h; + __be32 provided; + __u32 calculated; + void *tmpbuf; + + tmpbuf = kzalloc(j->j_blocksize, GFP_KERNEL); + if (!tmpbuf) + return false; + + memcpy(tmpbuf, buf, sizeof(struct commit_header)); + h = tmpbuf; + provided = h->h_chksum[0]; + h->h_chksum[0] = 0; + calculated = jbd2_chksum(j->j_csum_seed, tmpbuf, j->j_blocksize); + kfree(tmpbuf); + + return provided == cpu_to_be32(calculated); +} + +static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, + journal_block_tag3_t *tag3, + void *buf, __u32 sequence) +{ + __u32 csum32; + __be32 seq; + + if (!jbd2_journal_has_csum_v2or3(j)) + return 1; + + seq = cpu_to_be32(sequence); + csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); + csum32 = jbd2_chksum(csum32, buf, j->j_blocksize); + + if (jbd2_has_feature_csum3(j)) + return tag3->t_checksum == cpu_to_be32(csum32); + else + return tag->t_checksum == cpu_to_be16(csum32); +} + +static __always_inline int jbd2_do_replay(journal_t *journal, + struct recovery_info *info, + struct buffer_head *bh, + unsigned long *next_log_block, + unsigned int next_commit_ID) +{ + char *tagp; + int flags; + int ret = 0; + int tag_bytes = journal_tag_bytes(journal); + int descr_csum_size = 0; + unsigned long io_block; + journal_block_tag_t tag; + struct buffer_head *obh; + struct buffer_head *nbh; + + if (jbd2_journal_has_csum_v2or3(journal)) + descr_csum_size = sizeof(struct jbd2_journal_block_tail); + + tagp = &bh->b_data[sizeof(journal_header_t)]; + while (tagp - bh->b_data + tag_bytes <= + journal->j_blocksize - descr_csum_size) { + int err; + + memcpy(&tag, tagp, sizeof(tag)); + flags = be16_to_cpu(tag.t_flags); + + io_block = (*next_log_block)++; + wrap(journal, *next_log_block); + err = jread(&obh, journal, io_block); + if (err) { + /* Recover what we can, but report failure at the end. */ + ret = err; + pr_err("JBD2: IO error %d recovering block %lu in log\n", + err, io_block); + } else { + unsigned long long blocknr; + + J_ASSERT(obh != NULL); + blocknr = read_tag_block(journal, &tag); + + /* If the block has been revoked, then we're all done here. 
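The revoke test coming up below is the consumer end of the revoke records written at commit time. The producer is the filesystem: when it frees a block that an earlier transaction journalled as metadata, it must revoke it so that replay cannot resurrect stale contents over a since-reallocated block. A hedged sketch of that producer call:

/* Called while freeing a journalled metadata block under a running handle */
static int my_free_metadata_block(handle_t *handle, struct buffer_head *bh,
				  unsigned long long blocknr)
{
	int err;

	/*
	 * Records <blocknr, tid> in the revoke table; bh may be NULL if
	 * only the block number is known. A revoke record is emitted
	 * with the next commit.
	 */
	err = jbd2_journal_revoke(handle, blocknr, bh);
	if (err)
		return err;

	/* ... return the block to the allocator ... */
	return 0;
}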
*/ + if (jbd2_journal_test_revoke(journal, blocknr, + next_commit_ID)) { + brelse(obh); + ++info->nr_revoke_hits; + goto skip_write; + } + + /* Look for block corruption */ + if (!jbd2_block_tag_csum_verify(journal, &tag, + (journal_block_tag3_t *)tagp, + obh->b_data, next_commit_ID)) { + brelse(obh); + ret = -EFSBADCRC; + pr_err("JBD2: Invalid checksum recovering data block %llu in journal block %lu\n", + blocknr, io_block); + goto skip_write; + } + + /* Find a buffer for the new data being restored */ + nbh = __getblk(journal->j_fs_dev, blocknr, + journal->j_blocksize); + if (nbh == NULL) { + pr_err("JBD2: Out of memory during recovery.\n"); + brelse(obh); + return -ENOMEM; + } + + lock_buffer(nbh); + memcpy(nbh->b_data, obh->b_data, journal->j_blocksize); + if (flags & JBD2_FLAG_ESCAPE) { + *((__be32 *)nbh->b_data) = + cpu_to_be32(JBD2_MAGIC_NUMBER); + } + + BUFFER_TRACE(nbh, "marking dirty"); + set_buffer_uptodate(nbh); + mark_buffer_dirty(nbh); + BUFFER_TRACE(nbh, "marking uptodate"); + ++info->nr_replays; + unlock_buffer(nbh); + brelse(obh); + brelse(nbh); + } + +skip_write: + tagp += tag_bytes; + if (!(flags & JBD2_FLAG_SAME_UUID)) + tagp += 16; + + if (flags & JBD2_FLAG_LAST_TAG) + break; + } + + return ret; +} + +static int do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass) +{ + unsigned int first_commit_ID, next_commit_ID; + unsigned long next_log_block, head_block; + int err, success = 0; + journal_superblock_t * sb; + journal_header_t * tmp; + struct buffer_head *bh = NULL; + unsigned int sequence; + int blocktype; + __u32 crc32_sum = ~0; /* Transactional Checksums */ + bool need_check_commit_time = false; + __u64 last_trans_commit_time = 0, commit_time; + + /* + * First thing is to establish what we expect to find in the log + * (in terms of transaction IDs), and where (in terms of log + * block offsets): query the superblock. + */ + + sb = journal->j_superblock; + next_commit_ID = be32_to_cpu(sb->s_sequence); + next_log_block = be32_to_cpu(sb->s_start); + head_block = next_log_block; + + first_commit_ID = next_commit_ID; + if (pass == PASS_SCAN) + info->start_transaction = first_commit_ID; + else if (pass == PASS_REVOKE) { + /* + * Would the default revoke table have too long hash chains + * during replay? + */ + if (info->nr_revokes > JOURNAL_REVOKE_DEFAULT_HASH * 16) { + unsigned int hash_size; + + /* + * Aim for average chain length of 8, limit at 1M + * entries to avoid problems with malicious + * filesystems. + */ + hash_size = min(roundup_pow_of_two(info->nr_revokes / 8), + 1U << 20); + journal->j_revoke = + jbd2_journal_init_revoke_table(hash_size); + if (!journal->j_revoke) { + printk(KERN_ERR + "JBD2: failed to allocate revoke table for replay with %u entries. " + "Journal replay may be slow.\n", hash_size); + journal->j_revoke = journal->j_revoke_table[1]; + } + } + } + + jbd2_debug(1, "Starting recovery pass %d\n", pass); + + /* + * Now we walk through the log, transaction by transaction, + * making sure that each transaction has a commit block in the + * expected place. Each complete transaction gets replayed back + * into the main filesystem. + */ + + while (1) { + cond_resched(); + + /* If we already know where to stop the log traversal, + * check right now that we haven't gone past the end of + * the log. 
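As a worked example of the wrap() macro used throughout this traversal, with an assumed geometry of j_first = 1 and j_last = 8193 (roughly a 32MiB journal of 4KiB blocks): a scan that advances to var = 8195 wraps to 8195 - (8193 - 1) = 3, continuing from the head of the circular log.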
*/ + + if (pass != PASS_SCAN) + if (tid_geq(next_commit_ID, info->end_transaction)) + break; + + jbd2_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", + next_commit_ID, next_log_block, journal->j_last); + + /* Skip over each chunk of the transaction looking + * either the next descriptor block or the final commit + * record. */ + + jbd2_debug(3, "JBD2: checking block %ld\n", next_log_block); + brelse(bh); + bh = NULL; + err = jread(&bh, journal, next_log_block); + if (err) + goto failed; + + next_log_block++; + wrap(journal, next_log_block); + + /* What kind of buffer is it? + * + * If it is a descriptor block, check that it has the + * expected sequence number. Otherwise, we're all done + * here. */ + + tmp = (journal_header_t *)bh->b_data; + + if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) + break; + + blocktype = be32_to_cpu(tmp->h_blocktype); + sequence = be32_to_cpu(tmp->h_sequence); + jbd2_debug(3, "Found magic %d, sequence %d\n", + blocktype, sequence); + + if (sequence != next_commit_ID) + break; + + /* OK, we have a valid descriptor block which matches + * all of the sequence number checks. What are we going + * to do with it? That depends on the pass... */ + + switch(blocktype) { + case JBD2_DESCRIPTOR_BLOCK: + /* Verify checksum first */ + if (!jbd2_descriptor_block_csum_verify(journal, + bh->b_data)) { + /* + * PASS_SCAN can see stale blocks due to lazy + * journal init. Don't error out on those yet. + */ + if (pass != PASS_SCAN) { + pr_err("JBD2: Invalid checksum recovering block %lu in log\n", + next_log_block); + err = -EFSBADCRC; + goto failed; + } + need_check_commit_time = true; + jbd2_debug(1, + "invalid descriptor block found in %lu\n", + next_log_block); + } + + /* If it is a valid descriptor block, replay it + * in pass REPLAY; if journal_checksums enabled, then + * calculate checksums in PASS_SCAN, otherwise, + * just skip over the blocks it describes. */ + if (pass != PASS_REPLAY) { + if (pass == PASS_SCAN && + jbd2_has_feature_checksum(journal) && + !info->end_transaction) { + if (calc_chksums(journal, bh, + &next_log_block, + &crc32_sum)) + break; + continue; + } + next_log_block += count_tags(journal, bh); + wrap(journal, next_log_block); + continue; + } + + /* + * A descriptor block: we can now write all of the + * data blocks. Yay, useful work is finally getting + * done here! + */ + err = jbd2_do_replay(journal, info, bh, &next_log_block, + next_commit_ID); + if (err) { + if (err == -ENOMEM) + goto failed; + success = err; + } + + continue; + + case JBD2_COMMIT_BLOCK: + if (pass != PASS_SCAN) { + next_commit_ID++; + continue; + } + + /* How to differentiate between interrupted commit + * and journal corruption ? + * + * {nth transaction} + * Checksum Verification Failed + * | + * ____________________ + * | | + * async_commit sync_commit + * | | + * | GO TO NEXT "Journal Corruption" + * | TRANSACTION + * | + * {(n+1)th transanction} + * | + * _______|______________ + * | | + * Commit block found Commit block not found + * | | + * "Journal Corruption" | + * _____________|_________ + * | | + * nth trans corrupt OR nth trans + * and (n+1)th interrupted interrupted + * before commit block + * could reach the disk. + * (Cannot find the difference in above + * mentioned conditions. Hence assume + * "Interrupted Commit".) + */ + commit_time = be64_to_cpu( + ((struct commit_header *)bh->b_data)->h_commit_sec); + /* + * If need_check_commit_time is set, it means we are in + * PASS_SCAN and csum verify failed before. 
If + * commit_time is increasing, it's the same journal, + * otherwise it is stale journal block, just end this + * recovery. + */ + if (need_check_commit_time) { + if (commit_time >= last_trans_commit_time) { + pr_err("JBD2: Invalid checksum found in transaction %u\n", + next_commit_ID); + err = -EFSBADCRC; + goto failed; + } + ignore_crc_mismatch: + /* + * It likely does not belong to same journal, + * just end this recovery with success. + */ + jbd2_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n", + next_commit_ID); + goto done; + } + + /* + * Found an expected commit block: if checksums + * are present, verify them in PASS_SCAN; else not + * much to do other than move on to the next sequence + * number. + */ + if (jbd2_has_feature_checksum(journal)) { + struct commit_header *cbh = + (struct commit_header *)bh->b_data; + unsigned found_chksum = + be32_to_cpu(cbh->h_chksum[0]); + + if (info->end_transaction) { + journal->j_failed_commit = + info->end_transaction; + break; + } + + /* Neither checksum match nor unused? */ + if (!((crc32_sum == found_chksum && + cbh->h_chksum_type == + JBD2_CRC32_CHKSUM && + cbh->h_chksum_size == + JBD2_CRC32_CHKSUM_SIZE) || + (cbh->h_chksum_type == 0 && + cbh->h_chksum_size == 0 && + found_chksum == 0))) + goto chksum_error; + + crc32_sum = ~0; + goto chksum_ok; + } + + if (jbd2_commit_block_csum_verify(journal, bh->b_data)) + goto chksum_ok; + + if (jbd2_commit_block_csum_verify_partial(journal, + bh->b_data)) { + pr_notice("JBD2: Find incomplete commit block in transaction %u block %lu\n", + next_commit_ID, next_log_block); + goto chksum_ok; + } + +chksum_error: + if (commit_time < last_trans_commit_time) + goto ignore_crc_mismatch; + info->end_transaction = next_commit_ID; + info->head_block = head_block; + + if (!jbd2_has_feature_async_commit(journal)) { + journal->j_failed_commit = next_commit_ID; + break; + } + +chksum_ok: + last_trans_commit_time = commit_time; + head_block = next_log_block; + next_commit_ID++; + continue; + + case JBD2_REVOKE_BLOCK: + /* + * If we aren't in the SCAN or REVOKE pass, then we can + * just skip over this block. + */ + if (pass != PASS_REVOKE && pass != PASS_SCAN) + continue; + + /* + * Check revoke block crc in pass_scan, if csum verify + * failed, check commit block time later. + */ + if (pass == PASS_SCAN && + !jbd2_descriptor_block_csum_verify(journal, + bh->b_data)) { + jbd2_debug(1, "JBD2: invalid revoke block found in %lu\n", + next_log_block); + need_check_commit_time = true; + } + + err = scan_revoke_records(journal, pass, bh, + next_commit_ID, info); + if (err) + goto failed; + continue; + + default: + jbd2_debug(3, "Unrecognised magic %d, end of scan.\n", + blocktype); + goto done; + } + } + + done: + brelse(bh); + /* + * We broke out of the log scan loop: either we came to the + * known end of the log or we found an unexpected block in the + * log. If the latter happened, then we know that the "current" + * transaction marks the end of the valid log. + */ + + if (pass == PASS_SCAN) { + if (!info->end_transaction) + info->end_transaction = next_commit_ID; + if (!info->head_block) + info->head_block = head_block; + } else { + /* It's really bad news if different passes end up at + * different places (but possible due to IO errors). 
*/ + if (info->end_transaction != next_commit_ID) { + printk(KERN_ERR "JBD2: recovery pass %d ended at " + "transaction %u, expected %u\n", + pass, next_commit_ID, info->end_transaction); + if (!success) + success = -EIO; + } + } + + if (jbd2_has_feature_fast_commit(journal) && pass != PASS_REVOKE) { + err = fc_do_one_pass(journal, info, pass); + if (err) + success = err; + } + + return success; + + failed: + brelse(bh); + return err; +} + +/* Scan a revoke record, marking all blocks mentioned as revoked. */ + +static int scan_revoke_records(journal_t *journal, enum passtype pass, + struct buffer_head *bh, tid_t sequence, + struct recovery_info *info) +{ + jbd2_journal_revoke_header_t *header; + int offset, max; + unsigned csum_size = 0; + __u32 rcount; + int record_len = 4; + + header = (jbd2_journal_revoke_header_t *) bh->b_data; + offset = sizeof(jbd2_journal_revoke_header_t); + rcount = be32_to_cpu(header->r_count); + + if (jbd2_journal_has_csum_v2or3(journal)) + csum_size = sizeof(struct jbd2_journal_block_tail); + if (rcount > journal->j_blocksize - csum_size) + return -EINVAL; + max = rcount; + + if (jbd2_has_feature_64bit(journal)) + record_len = 8; + + if (pass == PASS_SCAN) { + info->nr_revokes += (max - offset) / record_len; + return 0; + } + + while (offset + record_len <= max) { + unsigned long long blocknr; + int err; + + if (record_len == 4) + blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); + else + blocknr = be64_to_cpu(* ((__be64 *) (bh->b_data+offset))); + offset += record_len; + err = jbd2_journal_set_revoke(journal, blocknr, sequence); + if (err) + return err; + } + return 0; +} -- 2.43.0
From: Simon Glass <simon.glass@canonical.com> Add the JBD2 journaling layer transaction management from the Linux 6.18 kernel ext4 filesystem driver. transaction.c handles: - Transaction lifecycle (start, stop, restart) - Handle management for filesystem operations - Buffer attachment to transactions - Transaction state machine - Credit reservation and accounting - Metadata and data buffer journaling This is the core transaction API that ext4 uses to ensure atomic updates to filesystem metadata. Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: Simon Glass <simon.glass@canonical.com> --- fs/jbd2/transaction.c | 2751 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2751 insertions(+) create mode 100644 fs/jbd2/transaction.c diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c new file mode 100644 index 00000000000..3e510564de6 --- /dev/null +++ b/fs/jbd2/transaction.c @@ -0,0 +1,2751 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * linux/fs/jbd2/transaction.c + * + * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * Generic filesystem transaction handling code; part of the ext2fs + * journaling system. + * + * This file manages transactions (compound commits managed by the + * journaling code) and handles (individual atomic operations by the + * filesystem). + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/jbd2.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/hrtimer.h> +#include <linux/backing-dev.h> +#include <linux/bug.h> +#include <linux/module.h> +#include <linux/sched/mm.h> + +#include <trace/events/jbd2.h> + +static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); +static void __jbd2_journal_unfile_buffer(struct journal_head *jh); + +static struct kmem_cache *transaction_cache; +int __init jbd2_journal_init_transaction_cache(void) +{ + J_ASSERT(!transaction_cache); + transaction_cache = kmem_cache_create("jbd2_transaction_s", + sizeof(transaction_t), + 0, + SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, + NULL); + if (!transaction_cache) { + pr_emerg("JBD2: failed to create transaction cache\n"); + return -ENOMEM; + } + return 0; +} + +void jbd2_journal_destroy_transaction_cache(void) +{ + kmem_cache_destroy(transaction_cache); + transaction_cache = NULL; +} + +void jbd2_journal_free_transaction(transaction_t *transaction) +{ + if (unlikely(ZERO_OR_NULL_PTR(transaction))) + return; + kmem_cache_free(transaction_cache, transaction); +} + +/* + * jbd2_get_transaction: obtain a new transaction_t object. + * + * Simply initialise a new transaction. Initialize it in + * RUNNING state and add it to the current journal (which should not + * have an existing running transaction: we only make a new transaction + * once we have started to commit the old one). + * + * Preconditions: + * The journal MUST be locked. We don't perform atomic mallocs on the + * new transaction and we can't block without protecting against other + * processes trying to touch the journal while it is in transition. 
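For orientation, the handle API this file implements is consumed by the filesystem roughly as follows, in a minimal sketch (ext4 wraps each call in ext4_jbd2.h helpers with extra error handling):

#include <linux/err.h>
#include <linux/jbd2.h>

/* One atomic metadata update from the filesystem's point of view */
static int my_update_metadata(journal_t *journal, struct buffer_head *bh)
{
	handle_t *handle;
	int err;

	/* Reserve credits for one modified buffer; may block for log space */
	handle = jbd2_journal_start(journal, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* Declare intent to modify bh within this transaction */
	err = jbd2_journal_get_write_access(handle, bh);
	if (err)
		goto out;

	/* ... modify bh->b_data ... */

	/* File bh as journalled metadata on the running transaction */
	err = jbd2_journal_dirty_metadata(handle, bh);
out:
	jbd2_journal_stop(handle);	/* drops t_updates; commit may proceed */
	return err;
}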
+ * + */ + +static void jbd2_get_transaction(journal_t *journal, + transaction_t *transaction) +{ + transaction->t_journal = journal; + transaction->t_state = T_RUNNING; + transaction->t_start_time = ktime_get(); + transaction->t_tid = journal->j_transaction_sequence++; + transaction->t_expires = jiffies + journal->j_commit_interval; + atomic_set(&transaction->t_updates, 0); + atomic_set(&transaction->t_outstanding_credits, + journal->j_transaction_overhead_buffers + + atomic_read(&journal->j_reserved_credits)); + atomic_set(&transaction->t_outstanding_revokes, 0); + atomic_set(&transaction->t_handle_count, 0); + INIT_LIST_HEAD(&transaction->t_inode_list); + + /* Set up the commit timer for the new transaction. */ + journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires); + add_timer(&journal->j_commit_timer); + + J_ASSERT(journal->j_running_transaction == NULL); + journal->j_running_transaction = transaction; + transaction->t_max_wait = 0; + transaction->t_start = jiffies; + transaction->t_requested = 0; +} + +/* + * Handle management. + * + * A handle_t is an object which represents a single atomic update to a + * filesystem, and which tracks all of the modifications which form part + * of that one update. + */ + +/* + * t_max_wait is carefully updated here with use of atomic compare exchange. + * Note that there could be multiplre threads trying to do this simultaneously + * hence using cmpxchg to avoid any use of locks in this case. + */ +static inline void update_t_max_wait(transaction_t *transaction, + unsigned long ts) +{ + unsigned long oldts, newts; + + if (time_after(transaction->t_start, ts)) { + newts = jbd2_time_diff(ts, transaction->t_start); + oldts = READ_ONCE(transaction->t_max_wait); + while (oldts < newts) + oldts = cmpxchg(&transaction->t_max_wait, oldts, newts); + } +} + +/* + * Wait until running transaction passes to T_FLUSH state and new transaction + * can thus be started. Also starts the commit if needed. The function expects + * running transaction to exist and releases j_state_lock. + */ +static void wait_transaction_locked(journal_t *journal) + __releases(journal->j_state_lock) +{ + DEFINE_WAIT(wait); + int need_to_start; + tid_t tid = journal->j_running_transaction->t_tid; + + prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait, + TASK_UNINTERRUPTIBLE); + need_to_start = !tid_geq(journal->j_commit_request, tid); + read_unlock(&journal->j_state_lock); + if (need_to_start) + jbd2_log_start_commit(journal, tid); + jbd2_might_wait_for_commit(journal); + schedule(); + finish_wait(&journal->j_wait_transaction_locked, &wait); +} + +/* + * Wait until running transaction transitions from T_SWITCH to T_FLUSH + * state and new transaction can thus be started. The function releases + * j_state_lock. + */ +static void wait_transaction_switching(journal_t *journal) + __releases(journal->j_state_lock) +{ + DEFINE_WAIT(wait); + + if (WARN_ON(!journal->j_running_transaction || + journal->j_running_transaction->t_state != T_SWITCH)) { + read_unlock(&journal->j_state_lock); + return; + } + prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait, + TASK_UNINTERRUPTIBLE); + read_unlock(&journal->j_state_lock); + /* + * We don't call jbd2_might_wait_for_commit() here as there's no + * waiting for outstanding handles happening anymore in T_SWITCH state + * and handling of reserved handles actually relies on that for + * correctness. 
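Reserved handles, mentioned above, let a caller set credits aside without joining a transaction and bind them later at a point where blocking is acceptable. A hedged sketch, assuming the reserved-handle entry points imported with this file, jbd2_journal_reserve_handle() and jbd2_journal_start_reserved(), with illustrative type/line_no arguments of 0:

#include <linux/err.h>
#include <linux/jbd2.h>

static int my_deferred_update(journal_t *journal)
{
	handle_t *handle;
	int err;

	/* Sets credits aside in j_reserved_credits; no transaction yet */
	handle = jbd2_journal_reserve_handle(journal, 8, GFP_NOFS, 0, 0);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* ... later, in a context that may block ... */

	/* Bind the reserved credits to the running transaction */
	err = jbd2_journal_start_reserved(handle, 0, 0);
	if (err)	/* on failure the reserved handle is dropped by jbd2 */
		return err;

	/* ... journalled updates as usual ... */

	return jbd2_journal_stop(handle);
}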
+	 */
+	schedule();
+	finish_wait(&journal->j_wait_transaction_locked, &wait);
+}
+
+static void sub_reserved_credits(journal_t *journal, int blocks)
+{
+	atomic_sub(blocks, &journal->j_reserved_credits);
+	wake_up(&journal->j_wait_reserved);
+}
+
+/* Maximum number of blocks for user transaction payload */
+static int jbd2_max_user_trans_buffers(journal_t *journal)
+{
+	return journal->j_max_transaction_buffers -
+		journal->j_transaction_overhead_buffers;
+}
+
+/*
+ * Wait until we can add credits for handle to the running transaction. Called
+ * with j_state_lock held for reading. Returns 0 if handle joined the running
+ * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
+ * caller must retry.
+ *
+ * Note: because j_state_lock may be dropped depending on the return
+ * value, we need to fake out sparse so it doesn't complain about a
+ * locking imbalance. Callers of add_transaction_credits will need to
+ * make a similar accommodation.
+ */
+static int add_transaction_credits(journal_t *journal, int blocks,
+				   int rsv_blocks)
+__must_hold(&journal->j_state_lock)
+{
+	transaction_t *t = journal->j_running_transaction;
+	int needed;
+	int total = blocks + rsv_blocks;
+
+	/*
+	 * If the current transaction is locked down for commit, wait
+	 * for the lock to be released.
+	 */
+	if (t->t_state != T_RUNNING) {
+		WARN_ON_ONCE(t->t_state >= T_FLUSH);
+		wait_transaction_locked(journal);
+		__acquire(&journal->j_state_lock); /* fake out sparse */
+		return 1;
+	}
+
+	/*
+	 * If there is not enough space left in the log to write all
+	 * potential buffers requested by this operation, we need to
+	 * stall pending a log checkpoint to free some more log space.
+	 */
+	needed = atomic_add_return(total, &t->t_outstanding_credits);
+	if (needed > journal->j_max_transaction_buffers) {
+		/*
+		 * If the current transaction is already too large,
+		 * then start to commit it: we can then go back and
+		 * attach this handle to a new transaction.
+		 */
+		atomic_sub(total, &t->t_outstanding_credits);
+
+		/*
+		 * Is the number of reserved credits in the current transaction too
+		 * big to fit this handle? Wait until reserved credits are freed.
+		 */
+		if (atomic_read(&journal->j_reserved_credits) + total >
+		    jbd2_max_user_trans_buffers(journal)) {
+			read_unlock(&journal->j_state_lock);
+			jbd2_might_wait_for_commit(journal);
+			wait_event(journal->j_wait_reserved,
+				   atomic_read(&journal->j_reserved_credits) + total <=
+				   jbd2_max_user_trans_buffers(journal));
+			__acquire(&journal->j_state_lock); /* fake out sparse */
+			return 1;
+		}
+
+		wait_transaction_locked(journal);
+		__acquire(&journal->j_state_lock); /* fake out sparse */
+		return 1;
+	}
+
+	/*
+	 * The commit code assumes that it can get enough log space
+	 * without forcing a checkpoint. This is *critical* for
+	 * correctness: a checkpoint of a buffer which is also
+	 * associated with a committing transaction creates a deadlock,
+	 * so commit simply cannot force through checkpoints.
+	 *
+	 * We must therefore ensure the necessary space in the journal
+	 * *before* starting to dirty potentially checkpointed buffers
+	 * in the new transaction.
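+	 *
+	 * Worked example (editor's sketch, made-up numbers): with
+	 * j_max_transaction_buffers = 8192 and jbd2_log_space_left() =
+	 * 4096, the check below fails, so we give back the credits we
+	 * just added and stall in __jbd2_log_wait_for_space() until
+	 * checkpointing frees enough log blocks for a full worst-case
+	 * transaction.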
+ */ + if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) { + atomic_sub(total, &t->t_outstanding_credits); + read_unlock(&journal->j_state_lock); + jbd2_might_wait_for_commit(journal); + write_lock(&journal->j_state_lock); + if (jbd2_log_space_left(journal) < + journal->j_max_transaction_buffers) + __jbd2_log_wait_for_space(journal); + write_unlock(&journal->j_state_lock); + __acquire(&journal->j_state_lock); /* fake out sparse */ + return 1; + } + + /* No reservation? We are done... */ + if (!rsv_blocks) + return 0; + + needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits); + /* We allow at most half of a transaction to be reserved */ + if (needed > jbd2_max_user_trans_buffers(journal) / 2) { + sub_reserved_credits(journal, rsv_blocks); + atomic_sub(total, &t->t_outstanding_credits); + read_unlock(&journal->j_state_lock); + jbd2_might_wait_for_commit(journal); + wait_event(journal->j_wait_reserved, + atomic_read(&journal->j_reserved_credits) + rsv_blocks + <= jbd2_max_user_trans_buffers(journal) / 2); + __acquire(&journal->j_state_lock); /* fake out sparse */ + return 1; + } + return 0; +} + +/* + * start_this_handle: Given a handle, deal with any locking or stalling + * needed to make sure that there is enough journal space for the handle + * to begin. Attach the handle to a transaction and set up the + * transaction's buffer credits. + */ + +static int start_this_handle(journal_t *journal, handle_t *handle, + gfp_t gfp_mask) +{ + transaction_t *transaction, *new_transaction = NULL; + int blocks = handle->h_total_credits; + int rsv_blocks = 0; + unsigned long ts = jiffies; + + if (handle->h_rsv_handle) + rsv_blocks = handle->h_rsv_handle->h_total_credits; + + /* + * Limit the number of reserved credits to 1/2 of maximum transaction + * size and limit the number of total credits to not exceed maximum + * transaction size per operation. + */ + if (rsv_blocks > jbd2_max_user_trans_buffers(journal) / 2 || + rsv_blocks + blocks > jbd2_max_user_trans_buffers(journal)) { + printk(KERN_ERR "JBD2: %s wants too many credits " + "credits:%d rsv_credits:%d max:%d\n", + current->comm, blocks, rsv_blocks, + jbd2_max_user_trans_buffers(journal)); + WARN_ON(1); + return -ENOSPC; + } + +alloc_transaction: + /* + * This check is racy but it is just an optimization of allocating new + * transaction early if there are high chances we'll need it. If we + * guess wrong, we'll retry or free unused transaction. + */ + if (!data_race(journal->j_running_transaction)) { + /* + * If __GFP_FS is not present, then we may be being called from + * inside the fs writeback layer, so we MUST NOT fail. + */ + if ((gfp_mask & __GFP_FS) == 0) + gfp_mask |= __GFP_NOFAIL; + new_transaction = kmem_cache_zalloc(transaction_cache, + gfp_mask); + if (!new_transaction) + return -ENOMEM; + } + + jbd2_debug(3, "New handle %p going live.\n", handle); + + /* + * We need to hold j_state_lock until t_updates has been incremented, + * for proper journal barrier handling + */ +repeat: + read_lock(&journal->j_state_lock); + BUG_ON(journal->j_flags & JBD2_UNMOUNT); + if (is_journal_aborted(journal) || + (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { + read_unlock(&journal->j_state_lock); + jbd2_journal_free_transaction(new_transaction); + return -EROFS; + } + + /* + * Wait on the journal's transaction barrier if necessary. Specifically + * we allow reserved handles to proceed because otherwise commit could + * deadlock on page writeback not being able to complete. 
+ */ + if (!handle->h_reserved && journal->j_barrier_count) { + read_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_transaction_locked, + journal->j_barrier_count == 0); + goto repeat; + } + + if (!journal->j_running_transaction) { + read_unlock(&journal->j_state_lock); + if (!new_transaction) + goto alloc_transaction; + write_lock(&journal->j_state_lock); + if (!journal->j_running_transaction && + (handle->h_reserved || !journal->j_barrier_count)) { + jbd2_get_transaction(journal, new_transaction); + new_transaction = NULL; + } + write_unlock(&journal->j_state_lock); + goto repeat; + } + + transaction = journal->j_running_transaction; + + if (!handle->h_reserved) { + /* We may have dropped j_state_lock - restart in that case */ + if (add_transaction_credits(journal, blocks, rsv_blocks)) { + /* + * add_transaction_credits releases + * j_state_lock on a non-zero return + */ + __release(&journal->j_state_lock); + goto repeat; + } + } else { + /* + * We have handle reserved so we are allowed to join T_LOCKED + * transaction and we don't have to check for transaction size + * and journal space. But we still have to wait while running + * transaction is being switched to a committing one as it + * won't wait for any handles anymore. + */ + if (transaction->t_state == T_SWITCH) { + wait_transaction_switching(journal); + goto repeat; + } + sub_reserved_credits(journal, blocks); + handle->h_reserved = 0; + } + + /* OK, account for the buffers that this operation expects to + * use and add the handle to the running transaction. + */ + update_t_max_wait(transaction, ts); + handle->h_transaction = transaction; + handle->h_requested_credits = blocks; + handle->h_revoke_credits_requested = handle->h_revoke_credits; + handle->h_start_jiffies = jiffies; + atomic_inc(&transaction->t_updates); + atomic_inc(&transaction->t_handle_count); + jbd2_debug(4, "Handle %p given %d credits (total %d, free %lu)\n", + handle, blocks, + atomic_read(&transaction->t_outstanding_credits), + jbd2_log_space_left(journal)); + read_unlock(&journal->j_state_lock); + current->journal_info = handle; + + rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_); + jbd2_journal_free_transaction(new_transaction); + /* + * Ensure that no allocations done while the transaction is open are + * going to recurse back to the fs layer. + */ + handle->saved_alloc_context = memalloc_nofs_save(); + return 0; +} + +/* Allocate a new handle. This should probably be in a slab... 
 */
+static handle_t *new_handle(int nblocks)
+{
+	handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
+	if (!handle)
+		return NULL;
+	handle->h_total_credits = nblocks;
+	handle->h_ref = 1;
+
+	return handle;
+}
+
+handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
+			      int revoke_records, gfp_t gfp_mask,
+			      unsigned int type, unsigned int line_no)
+{
+	handle_t *handle = journal_current_handle();
+	int err;
+
+	if (!journal)
+		return ERR_PTR(-EROFS);
+
+	if (handle) {
+		J_ASSERT(handle->h_transaction->t_journal == journal);
+		handle->h_ref++;
+		return handle;
+	}
+
+	nblocks += DIV_ROUND_UP(revoke_records,
+				journal->j_revoke_records_per_block);
+	handle = new_handle(nblocks);
+	if (!handle)
+		return ERR_PTR(-ENOMEM);
+	if (rsv_blocks) {
+		handle_t *rsv_handle;
+
+		rsv_handle = new_handle(rsv_blocks);
+		if (!rsv_handle) {
+			jbd2_free_handle(handle);
+			return ERR_PTR(-ENOMEM);
+		}
+		rsv_handle->h_reserved = 1;
+		rsv_handle->h_journal = journal;
+		handle->h_rsv_handle = rsv_handle;
+	}
+	handle->h_revoke_credits = revoke_records;
+
+	err = start_this_handle(journal, handle, gfp_mask);
+	if (err < 0) {
+		if (handle->h_rsv_handle)
+			jbd2_free_handle(handle->h_rsv_handle);
+		jbd2_free_handle(handle);
+		return ERR_PTR(err);
+	}
+	handle->h_type = type;
+	handle->h_line_no = line_no;
+	trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
+				handle->h_transaction->t_tid, type,
+				line_no, nblocks);
+
+	return handle;
+}
+EXPORT_SYMBOL(jbd2__journal_start);
+
+
+/**
+ * jbd2_journal_start() - Obtain a new handle.
+ * @journal: Journal to start transaction on.
+ * @nblocks: number of buffers we might modify
+ *
+ * We make sure that the transaction can guarantee at least nblocks of
+ * modified buffers in the log. We block until the log can guarantee
+ * that much space. Additionally, if rsv_blocks > 0, we also create another
+ * handle with rsv_blocks reserved blocks in the journal. This handle is
+ * stored in h_rsv_handle. It is not attached to any particular transaction
+ * and thus doesn't block transaction commit. If the caller uses this reserved
+ * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
+ * on the parent handle will dispose the reserved one. Reserved handle has to
+ * be converted to a normal handle using jbd2_journal_start_reserved() before
+ * it can be used.
+ *
+ * Return a pointer to a newly allocated handle, or an ERR_PTR() value
+ * on failure.
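+ *
+ * Sketch of the reserved-handle convention described above (an
+ * editor's illustration of a hypothetical caller, not code from this
+ * file):
+ *
+ *	handle = jbd2__journal_start(journal, nblocks, rsv_blocks, 0,
+ *				     GFP_NOFS, 0, 0);
+ *	rsv = handle->h_rsv_handle;
+ *	handle->h_rsv_handle = NULL;		(caller takes ownership)
+ *	...
+ *	jbd2_journal_start_reserved(rsv, 0, 0);	(convert before use)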
+ */
+handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
+{
+	return jbd2__journal_start(journal, nblocks, 0, 0, GFP_NOFS, 0, 0);
+}
+EXPORT_SYMBOL(jbd2_journal_start);
+
+static void __jbd2_journal_unreserve_handle(handle_t *handle, transaction_t *t)
+{
+	journal_t *journal = handle->h_journal;
+
+	WARN_ON(!handle->h_reserved);
+	sub_reserved_credits(journal, handle->h_total_credits);
+	if (t)
+		atomic_sub(handle->h_total_credits, &t->t_outstanding_credits);
+}
+
+void jbd2_journal_free_reserved(handle_t *handle)
+{
+	journal_t *journal = handle->h_journal;
+
+	/* Get j_state_lock to pin running transaction if it exists */
+	read_lock(&journal->j_state_lock);
+	__jbd2_journal_unreserve_handle(handle, journal->j_running_transaction);
+	read_unlock(&journal->j_state_lock);
+	jbd2_free_handle(handle);
+}
+EXPORT_SYMBOL(jbd2_journal_free_reserved);
+
+/**
+ * jbd2_journal_start_reserved() - start reserved handle
+ * @handle: handle to start
+ * @type: for handle statistics
+ * @line_no: for handle statistics
+ *
+ * Start handle that has been previously reserved with jbd2_journal_reserve().
+ * This attaches @handle to the running transaction (or creates one if there's
+ * no transaction running). Unlike jbd2_journal_start() this function cannot
+ * block on journal commit, checkpointing, or similar stuff. It can block on
+ * memory allocation or frozen journal though.
+ *
+ * Return 0 on success, non-zero on error - handle is freed in that case.
+ */
+int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
+				unsigned int line_no)
+{
+	journal_t *journal = handle->h_journal;
+	int ret = -EIO;
+
+	if (WARN_ON(!handle->h_reserved)) {
+		/* Someone passed in normal handle? Just stop it. */
+		jbd2_journal_stop(handle);
+		return ret;
+	}
+	/*
+	 * Usefulness of mixing of reserved and unreserved handles is
+	 * questionable. So far nobody seems to need it so just error out.
+	 */
+	if (WARN_ON(current->journal_info)) {
+		jbd2_journal_free_reserved(handle);
+		return ret;
+	}
+
+	handle->h_journal = NULL;
+	/*
+	 * GFP_NOFS is here because callers are likely from writeback or
+	 * similarly constrained call sites
+	 */
+	ret = start_this_handle(journal, handle, GFP_NOFS);
+	if (ret < 0) {
+		handle->h_journal = journal;
+		jbd2_journal_free_reserved(handle);
+		return ret;
+	}
+	handle->h_type = type;
+	handle->h_line_no = line_no;
+	trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
+				handle->h_transaction->t_tid, type,
+				line_no, handle->h_total_credits);
+	return 0;
+}
+EXPORT_SYMBOL(jbd2_journal_start_reserved);
+
+/**
+ * jbd2_journal_extend() - extend buffer credits.
+ * @handle: handle to 'extend'
+ * @nblocks: nr blocks to try to extend by.
+ * @revoke_records: number of revoke records to try to extend by.
+ *
+ * Some transactions, such as large extends and truncates, can be done
+ * atomically all at once or in several stages. The operation requests
+ * a credit for a number of buffer modifications in advance, but can
+ * extend its credit if it needs more.
+ *
+ * jbd2_journal_extend tries to give the running handle more buffer credits.
+ * It does not guarantee that the allocation will succeed - this is a
+ * best-effort only. The calling process MUST be able to deal cleanly with
+ * a failure to extend here.
+ *
+ * Return 0 on success, non-zero on failure.
+ *
+ * return code < 0 implies an error
+ * return code > 0 implies normal transaction-full status.
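+ *
+ * The usual caller pattern (editor's sketch of a hypothetical caller)
+ * is extend-or-restart:
+ *
+ *	err = jbd2_journal_extend(handle, nblocks, 0);
+ *	if (err > 0)
+ *		err = jbd2_journal_restart(handle, nblocks);
+ *	if (err < 0)
+ *		goto fail;	(handle aborted or out of space)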
+ */ +int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal; + int result; + int wanted; + + if (is_handle_aborted(handle)) + return -EROFS; + journal = transaction->t_journal; + + result = 1; + + read_lock(&journal->j_state_lock); + + /* Don't extend a locked-down transaction! */ + if (transaction->t_state != T_RUNNING) { + jbd2_debug(3, "denied handle %p %d blocks: " + "transaction not running\n", handle, nblocks); + goto error_out; + } + + nblocks += DIV_ROUND_UP( + handle->h_revoke_credits_requested + revoke_records, + journal->j_revoke_records_per_block) - + DIV_ROUND_UP( + handle->h_revoke_credits_requested, + journal->j_revoke_records_per_block); + wanted = atomic_add_return(nblocks, + &transaction->t_outstanding_credits); + + if (wanted > journal->j_max_transaction_buffers) { + jbd2_debug(3, "denied handle %p %d blocks: " + "transaction too large\n", handle, nblocks); + atomic_sub(nblocks, &transaction->t_outstanding_credits); + goto error_out; + } + + trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev, + transaction->t_tid, + handle->h_type, handle->h_line_no, + handle->h_total_credits, + nblocks); + + handle->h_total_credits += nblocks; + handle->h_requested_credits += nblocks; + handle->h_revoke_credits += revoke_records; + handle->h_revoke_credits_requested += revoke_records; + result = 0; + + jbd2_debug(3, "extended handle %p by %d\n", handle, nblocks); +error_out: + read_unlock(&journal->j_state_lock); + return result; +} + +static void stop_this_handle(handle_t *handle) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int revokes; + + J_ASSERT(journal_current_handle() == handle); + J_ASSERT(atomic_read(&transaction->t_updates) > 0); + current->journal_info = NULL; + /* + * Subtract necessary revoke descriptor blocks from handle credits. We + * take care to account only for revoke descriptor blocks the + * transaction will really need as large sequences of transactions with + * small numbers of revokes are relatively common. + */ + revokes = handle->h_revoke_credits_requested - handle->h_revoke_credits; + if (revokes) { + int t_revokes, revoke_descriptors; + int rr_per_blk = journal->j_revoke_records_per_block; + + WARN_ON_ONCE(DIV_ROUND_UP(revokes, rr_per_blk) + > handle->h_total_credits); + t_revokes = atomic_add_return(revokes, + &transaction->t_outstanding_revokes); + revoke_descriptors = + DIV_ROUND_UP(t_revokes, rr_per_blk) - + DIV_ROUND_UP(t_revokes - revokes, rr_per_blk); + handle->h_total_credits -= revoke_descriptors; + } + atomic_sub(handle->h_total_credits, + &transaction->t_outstanding_credits); + if (handle->h_rsv_handle) + __jbd2_journal_unreserve_handle(handle->h_rsv_handle, + transaction); + if (atomic_dec_and_test(&transaction->t_updates)) + wake_up(&journal->j_wait_updates); + + rwsem_release(&journal->j_trans_commit_map, _THIS_IP_); + /* + * Scope of the GFP_NOFS context is over here and so we can restore the + * original alloc context. + */ + memalloc_nofs_restore(handle->saved_alloc_context); +} + +/** + * jbd2__journal_restart() - restart a handle . + * @handle: handle to restart + * @nblocks: nr credits requested + * @revoke_records: number of revoke record credits requested + * @gfp_mask: memory allocation flags (for start_this_handle) + * + * Restart a handle for a multi-transaction filesystem + * operation. 
+ *
+ * If the jbd2_journal_extend() call above fails to grant new buffer credits
+ * to a running handle, a call to jbd2_journal_restart will commit the
+ * handle's transaction so far and reattach the handle to a new
+ * transaction capable of guaranteeing the requested number of
+ * credits. We preserve reserved handle if there's any attached to the
+ * passed in handle.
+ */
+int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records,
+			  gfp_t gfp_mask)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal;
+	tid_t tid;
+	int need_to_start;
+	int ret;
+
+	/* If we've had an abort of any type, don't even think about
+	 * actually doing the restart! */
+	if (is_handle_aborted(handle))
+		return 0;
+	journal = transaction->t_journal;
+	tid = transaction->t_tid;
+
+	/*
+	 * First unlink the handle from its current transaction, and start the
+	 * commit on that.
+	 */
+	jbd2_debug(2, "restarting handle %p\n", handle);
+	stop_this_handle(handle);
+	handle->h_transaction = NULL;
+
+	/*
+	 * TODO: If we use READ_ONCE / WRITE_ONCE for j_commit_request we can
+	 * get rid of pointless j_state_lock traffic like this.
+	 */
+	read_lock(&journal->j_state_lock);
+	need_to_start = !tid_geq(journal->j_commit_request, tid);
+	read_unlock(&journal->j_state_lock);
+	if (need_to_start)
+		jbd2_log_start_commit(journal, tid);
+	handle->h_total_credits = nblocks +
+		DIV_ROUND_UP(revoke_records,
+			     journal->j_revoke_records_per_block);
+	handle->h_revoke_credits = revoke_records;
+	ret = start_this_handle(journal, handle, gfp_mask);
+	trace_jbd2_handle_restart(journal->j_fs_dev->bd_dev,
+				  ret ? 0 : handle->h_transaction->t_tid,
+				  handle->h_type, handle->h_line_no,
+				  handle->h_total_credits);
+	return ret;
+}
+EXPORT_SYMBOL(jbd2__journal_restart);
+
+
+int jbd2_journal_restart(handle_t *handle, int nblocks)
+{
+	return jbd2__journal_restart(handle, nblocks, 0, GFP_NOFS);
+}
+EXPORT_SYMBOL(jbd2_journal_restart);
+
+/*
+ * Waits for any outstanding t_updates to finish.
+ * This is called with write j_state_lock held.
+ */
+void jbd2_journal_wait_updates(journal_t *journal)
+{
+	DEFINE_WAIT(wait);
+
+	while (1) {
+		/*
+		 * Note that the running transaction can get freed under us if
+		 * this transaction is getting committed in
+		 * jbd2_journal_commit_transaction() ->
+		 * jbd2_journal_free_transaction(). This can only happen when we
+		 * release j_state_lock -> schedule() -> acquire j_state_lock.
+		 * Hence we should retrieve the new j_running_transaction value
+		 * every time (after a j_state_lock release/acquire cycle), else
+		 * it may lead to use-after-free of the old freed transaction.
+		 */
+		transaction_t *transaction = journal->j_running_transaction;
+
+		if (!transaction)
+			break;
+
+		prepare_to_wait(&journal->j_wait_updates, &wait,
+				TASK_UNINTERRUPTIBLE);
+		if (!atomic_read(&transaction->t_updates)) {
+			finish_wait(&journal->j_wait_updates, &wait);
+			break;
+		}
+		write_unlock(&journal->j_state_lock);
+		schedule();
+		finish_wait(&journal->j_wait_updates, &wait);
+		write_lock(&journal->j_state_lock);
+	}
+}
+
+/**
+ * jbd2_journal_lock_updates () - establish a transaction barrier.
+ * @journal: Journal to establish a barrier on.
+ *
+ * This locks out any further updates from being started, and blocks
+ * until all existing updates have completed, returning only once the
+ * journal is in a quiescent state with no updates running.
+ *
+ * The journal lock should not be held on entry.
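+ *
+ * Pairing sketch (editor's illustration):
+ *
+ *	jbd2_journal_lock_updates(journal);
+ *	...journal is quiescent: no handles running, none can start...
+ *	jbd2_journal_unlock_updates(journal);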
+ */ +void jbd2_journal_lock_updates(journal_t *journal) +{ + jbd2_might_wait_for_commit(journal); + + write_lock(&journal->j_state_lock); + ++journal->j_barrier_count; + + /* Wait until there are no reserved handles */ + if (atomic_read(&journal->j_reserved_credits)) { + write_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_reserved, + atomic_read(&journal->j_reserved_credits) == 0); + write_lock(&journal->j_state_lock); + } + + /* Wait until there are no running t_updates */ + jbd2_journal_wait_updates(journal); + + write_unlock(&journal->j_state_lock); + + /* + * We have now established a barrier against other normal updates, but + * we also need to barrier against other jbd2_journal_lock_updates() calls + * to make sure that we serialise special journal-locked operations + * too. + */ + mutex_lock(&journal->j_barrier); +} + +/** + * jbd2_journal_unlock_updates () - release barrier + * @journal: Journal to release the barrier on. + * + * Release a transaction barrier obtained with jbd2_journal_lock_updates(). + * + * Should be called without the journal lock held. + */ +void jbd2_journal_unlock_updates (journal_t *journal) +{ + J_ASSERT(journal->j_barrier_count != 0); + + mutex_unlock(&journal->j_barrier); + write_lock(&journal->j_state_lock); + --journal->j_barrier_count; + write_unlock(&journal->j_state_lock); + wake_up_all(&journal->j_wait_transaction_locked); +} + +static void warn_dirty_buffer(struct buffer_head *bh) +{ + printk(KERN_WARNING + "JBD2: Spotted dirty metadata buffer (dev = %pg, blocknr = %llu). " + "There's a risk of filesystem corruption in case of system " + "crash.\n", + bh->b_bdev, (unsigned long long)bh->b_blocknr); +} + +/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */ +static void jbd2_freeze_jh_data(struct journal_head *jh) +{ + char *source; + struct buffer_head *bh = jh2bh(jh); + + J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n"); + source = kmap_local_folio(bh->b_folio, bh_offset(bh)); + /* Fire data frozen trigger just before we copy the data */ + jbd2_buffer_frozen_trigger(jh, source, jh->b_triggers); + memcpy(jh->b_frozen_data, source, bh->b_size); + kunmap_local(source); + + /* + * Now that the frozen data is saved off, we need to store any matching + * triggers. + */ + jh->b_frozen_triggers = jh->b_triggers; +} + +/* + * If the buffer is already part of the current transaction, then there + * is nothing we need to do. If it is already part of a prior + * transaction which we are still committing to disk, then we need to + * make sure that we do not overwrite the old copy: we do copy-out to + * preserve the copy going to disk. We also account the buffer against + * the handle's metadata buffer credits (unless the buffer is already + * part of the transaction, that is). + * + */ +static int +do_get_write_access(handle_t *handle, struct journal_head *jh, + int force_copy) +{ + struct buffer_head *bh; + transaction_t *transaction = handle->h_transaction; + journal_t *journal; + int error; + char *frozen_buffer = NULL; + unsigned long start_lock, time_lock; + + journal = transaction->t_journal; + + jbd2_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); + + JBUFFER_TRACE(jh, "entry"); +repeat: + bh = jh2bh(jh); + + /* @@@ Need to check for errors here at some point. 
*/ + + start_lock = jiffies; + lock_buffer(bh); + spin_lock(&jh->b_state_lock); + + /* If it takes too long to lock the buffer, trace it */ + time_lock = jbd2_time_diff(start_lock, jiffies); + if (time_lock > HZ/10) + trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev, + jiffies_to_msecs(time_lock)); + + /* We now hold the buffer lock so it is safe to query the buffer + * state. Is the buffer dirty? + * + * If so, there are two possibilities. The buffer may be + * non-journaled, and undergoing a quite legitimate writeback. + * Otherwise, it is journaled, and we don't expect dirty buffers + * in that state (the buffers should be marked JBD_Dirty + * instead.) So either the IO is being done under our own + * control and this is a bug, or it's a third party IO such as + * dump(8) (which may leave the buffer scheduled for read --- + * ie. locked but not dirty) or tune2fs (which may actually have + * the buffer dirtied, ugh.) */ + + if (buffer_dirty(bh) && jh->b_transaction) { + warn_dirty_buffer(bh); + /* + * We need to clean the dirty flag and we must do it under the + * buffer lock to be sure we don't race with running write-out. + */ + JBUFFER_TRACE(jh, "Journalling dirty buffer"); + clear_buffer_dirty(bh); + /* + * The buffer is going to be added to BJ_Reserved list now and + * nothing guarantees jbd2_journal_dirty_metadata() will be + * ever called for it. So we need to set jbddirty bit here to + * make sure the buffer is dirtied and written out when the + * journaling machinery is done with it. + */ + set_buffer_jbddirty(bh); + } + + error = -EROFS; + if (is_handle_aborted(handle)) { + spin_unlock(&jh->b_state_lock); + unlock_buffer(bh); + goto out; + } + error = 0; + + /* + * The buffer is already part of this transaction if b_transaction or + * b_next_transaction points to it + */ + if (jh->b_transaction == transaction || + jh->b_next_transaction == transaction) { + unlock_buffer(bh); + goto done; + } + + /* + * this is the first time this transaction is touching this buffer, + * reset the modified flag + */ + jh->b_modified = 0; + + /* + * If the buffer is not journaled right now, we need to make sure it + * doesn't get written to disk before the caller actually commits the + * new data + */ + if (!jh->b_transaction) { + JBUFFER_TRACE(jh, "no transaction"); + J_ASSERT_JH(jh, !jh->b_next_transaction); + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + /* + * Make sure all stores to jh (b_modified, b_frozen_data) are + * visible before attaching it to the running transaction. + * Paired with barrier in jbd2_write_access_granted() + */ + smp_wmb(); + spin_lock(&journal->j_list_lock); + if (test_clear_buffer_dirty(bh)) { + /* + * Execute buffer dirty clearing and jh->b_transaction + * assignment under journal->j_list_lock locked to + * prevent bh being removed from checkpoint list if + * the buffer is in an intermediate state (not dirty + * and jh->b_transaction is NULL). 
+ */ + JBUFFER_TRACE(jh, "Journalling dirty buffer"); + set_buffer_jbddirty(bh); + } + __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); + spin_unlock(&journal->j_list_lock); + unlock_buffer(bh); + goto done; + } + unlock_buffer(bh); + + /* + * If there is already a copy-out version of this buffer, then we don't + * need to make another one + */ + if (jh->b_frozen_data) { + JBUFFER_TRACE(jh, "has frozen data"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + goto attach_next; + } + + JBUFFER_TRACE(jh, "owned by older transaction"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction); + + /* + * There is one case we have to be very careful about. If the + * committing transaction is currently writing this buffer out to disk + * and has NOT made a copy-out, then we cannot modify the buffer + * contents at all right now. The essence of copy-out is that it is + * the extra copy, not the primary copy, which gets journaled. If the + * primary copy is already going to disk then we cannot do copy-out + * here. + */ + if (buffer_shadow(bh)) { + JBUFFER_TRACE(jh, "on shadow: sleep"); + spin_unlock(&jh->b_state_lock); + wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE); + goto repeat; + } + + /* + * Only do the copy if the currently-owning transaction still needs it. + * If buffer isn't on BJ_Metadata list, the committing transaction is + * past that stage (here we use the fact that BH_Shadow is set under + * bh_state lock together with refiling to BJ_Shadow list and at this + * point we know the buffer doesn't have BH_Shadow set). + * + * Subtle point, though: if this is a get_undo_access, then we will be + * relying on the frozen_data to contain the new value of the + * committed_data record after the transaction, so we HAVE to force the + * frozen_data copy in that case. + */ + if (jh->b_jlist == BJ_Metadata || force_copy) { + JBUFFER_TRACE(jh, "generate frozen data"); + if (!frozen_buffer) { + JBUFFER_TRACE(jh, "allocate memory for buffer"); + spin_unlock(&jh->b_state_lock); + frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, + GFP_NOFS | __GFP_NOFAIL); + goto repeat; + } + jh->b_frozen_data = frozen_buffer; + frozen_buffer = NULL; + jbd2_freeze_jh_data(jh); + } +attach_next: + /* + * Make sure all stores to jh (b_modified, b_frozen_data) are visible + * before attaching it to the running transaction. Paired with barrier + * in jbd2_write_access_granted() + */ + smp_wmb(); + jh->b_next_transaction = transaction; + +done: + spin_unlock(&jh->b_state_lock); + + /* + * If we are about to journal a buffer, then any revoke pending on it is + * no longer valid + */ + jbd2_journal_cancel_revoke(handle, jh); + +out: + if (unlikely(frozen_buffer)) /* It's usually NULL */ + jbd2_free(frozen_buffer, bh->b_size); + + JBUFFER_TRACE(jh, "exit"); + return error; +} + +/* Fast check whether buffer is already attached to the required transaction */ +static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh, + bool undo) +{ + struct journal_head *jh; + bool ret = false; + + /* Dirty buffers require special handling... */ + if (buffer_dirty(bh)) + return false; + + /* + * RCU protects us from dereferencing freed pages. So the checks we do + * are guaranteed not to oops. However the jh slab object can get freed + * & reallocated while we work with it. So we have to be careful. When + * we see jh attached to the running transaction, we know it must stay + * so until the transaction is committed. 
Thus jh won't be freed and
+ * will be attached to the same bh while we run. However it can
+ * happen that jh gets freed, reallocated, and attached to the
+ * transaction just after we get a pointer to it from bh. So we have to
+ * be careful and recheck jh still belongs to our bh before we return
+ * success.
+ */
+	rcu_read_lock();
+	if (!buffer_jbd(bh))
+		goto out;
+	/* This should be bh2jh() but that doesn't work with inline functions */
+	jh = READ_ONCE(bh->b_private);
+	if (!jh)
+		goto out;
+	/* For undo access buffer must have data copied */
+	if (undo && !jh->b_committed_data)
+		goto out;
+	if (READ_ONCE(jh->b_transaction) != handle->h_transaction &&
+	    READ_ONCE(jh->b_next_transaction) != handle->h_transaction)
+		goto out;
+	/*
+	 * There are two reasons for the barrier here:
+	 * 1) Make sure to fetch b_bh after we did previous checks so that we
+	 * detect when jh went through free, realloc, attach to transaction
+	 * while we were checking. Paired with implicit barrier in that path.
+	 * 2) So that access to bh done after jbd2_write_access_granted()
+	 * doesn't get reordered and see inconsistent state of concurrent
+	 * do_get_write_access().
+	 */
+	smp_mb();
+	if (unlikely(jh->b_bh != bh))
+		goto out;
+	ret = true;
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
+/**
+ * jbd2_journal_get_write_access() - notify intent to modify a buffer
+ *				     for metadata (not data) update.
+ * @handle: transaction to add buffer modifications to
+ * @bh: bh to be used for metadata writes
+ *
+ * Returns: error code or 0 on success.
+ *
+ * In full data journalling mode the buffer may be of type BJ_AsyncData,
+ * because we're ``write()ing`` a buffer which is also part of a shared mapping.
+ */
+
+int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
+{
+	struct journal_head *jh;
+	journal_t *journal;
+	int rc;
+
+	if (is_handle_aborted(handle))
+		return -EROFS;
+
+	journal = handle->h_transaction->t_journal;
+	if (jbd2_check_fs_dev_write_error(journal)) {
+		/*
+		 * If the fs dev has writeback errors, it may have failed
+		 * to async write out metadata buffers in the background.
+		 * In this case, we could read old data from disk and write
+		 * it out again, which may lead to on-disk filesystem
+		 * inconsistency. Aborting the journal can prevent that.
+		 */
+		jbd2_journal_abort(journal, -EIO);
+		return -EIO;
+	}
+
+	if (jbd2_write_access_granted(handle, bh, false))
+		return 0;
+
+	jh = jbd2_journal_add_journal_head(bh);
+	/* We do not want to get caught playing with fields which the
+	 * log thread also manipulates. Make sure that the buffer
+	 * completes any outstanding IO before proceeding. */
+	rc = do_get_write_access(handle, jh, 0);
+	jbd2_journal_put_journal_head(jh);
+	return rc;
+}
+
+
+/*
+ * When the user wants to journal a newly created buffer_head
+ * (ie. getblk() returned a new buffer and we are going to populate it
+ * manually rather than reading off disk), then we need to keep the
+ * buffer_head locked until it has been completely filled with new
+ * data. In this case, we should be able to make the assertion that
+ * the bh is not already part of an existing transaction.
+ *
+ * The buffer should already be locked by the caller by this point.
+ * There is no lock ranking violation: it was a newly created,
+ * unlocked buffer beforehand. */
+
+/**
+ * jbd2_journal_get_create_access () - notify intent to use newly created bh
+ * @handle: transaction to add the new buffer to
+ * @bh: new buffer.
+ *
+ * Call this if you create a new bh.
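+ *
+ * Typical use (editor's sketch; the sb_getblk() caller is
+ * hypothetical, not code from this file):
+ *
+ *	bh = sb_getblk(sb, blocknr);		(freshly allocated block)
+ *	lock_buffer(bh);
+ *	err = jbd2_journal_get_create_access(handle, bh);
+ *	memset(bh->b_data, 0, bh->b_size);
+ *	...fill in the new block...
+ *	unlock_buffer(bh);
+ *	err = jbd2_journal_dirty_metadata(handle, bh);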
+ */
+int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal;
+	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+	int err;
+
+	jbd2_debug(5, "journal_head %p\n", jh);
+	err = -EROFS;
+	if (is_handle_aborted(handle))
+		goto out;
+	journal = transaction->t_journal;
+	err = 0;
+
+	JBUFFER_TRACE(jh, "entry");
+	/*
+	 * The buffer may already belong to this transaction due to pre-zeroing
+	 * in the filesystem's new_block code. It may also be on the previous,
+	 * committing transaction's lists, but it HAS to be in Forget state in
+	 * that case: the transaction must have deleted the buffer for it to be
+	 * reused here.
+	 */
+	spin_lock(&jh->b_state_lock);
+	J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
+		jh->b_transaction == NULL ||
+		(jh->b_transaction == journal->j_committing_transaction &&
+			  jh->b_jlist == BJ_Forget)));
+
+	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
+
+	if (jh->b_transaction == NULL) {
+		/*
+		 * Previous jbd2_journal_forget() could have left the buffer
+		 * with jbddirty bit set because it was being committed. When
+		 * the commit finished, we've filed the buffer for
+		 * checkpointing and marked it dirty. Now we are reallocating
+		 * the buffer so the transaction freeing it must have
+		 * committed and so it's safe to clear the dirty bit.
+		 */
+		clear_buffer_dirty(jh2bh(jh));
+		/* first access by this transaction */
+		jh->b_modified = 0;
+
+		JBUFFER_TRACE(jh, "file as BJ_Reserved");
+		spin_lock(&journal->j_list_lock);
+		__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
+		spin_unlock(&journal->j_list_lock);
+	} else if (jh->b_transaction == journal->j_committing_transaction) {
+		/* first access by this transaction */
+		jh->b_modified = 0;
+
+		JBUFFER_TRACE(jh, "set next transaction");
+		spin_lock(&journal->j_list_lock);
+		jh->b_next_transaction = transaction;
+		spin_unlock(&journal->j_list_lock);
+	}
+	spin_unlock(&jh->b_state_lock);
+
+	/*
+	 * akpm: I added this. ext3_alloc_branch can pick up new indirect
+	 * blocks which contain freed but then revoked metadata. We need
+	 * to cancel the revoke in case we end up freeing it yet again
+	 * and then reallocating it as data - this would cause a second
+	 * revoke, which hits an assertion error.
+	 */
+	JBUFFER_TRACE(jh, "cancelling revoke");
+	jbd2_journal_cancel_revoke(handle, jh);
+out:
+	jbd2_journal_put_journal_head(jh);
+	return err;
+}
+
+/**
+ * jbd2_journal_get_undo_access() - Notify intent to modify metadata with
+ *				     non-rewindable consequences
+ * @handle: transaction
+ * @bh: buffer to undo
+ *
+ * Sometimes there is a need to distinguish between metadata which has
+ * been committed to disk and that which has not. The ext3fs code uses
+ * this for freeing and allocating space; we have to make sure that we
+ * do not reuse freed space until the deallocation has been committed,
+ * since if we overwrote that space we would make the delete
+ * un-rewindable in case of a crash.
+ *
+ * To deal with that, jbd2_journal_get_undo_access requests write access to a
+ * buffer for parts of non-rewindable operations such as delete
+ * operations on the bitmaps. The journaling code must keep a copy of
+ * the buffer's contents prior to the undo_access call until such time
+ * as we know that the buffer has definitely been committed to disk.
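+ *
+ * (Editor's sketch of the classic caller, bitmap deallocation;
+ * bitmap_bh is hypothetical:
+ *
+ *	err = jbd2_journal_get_undo_access(handle, bitmap_bh);
+ *	...clear bits in bitmap_bh; jh->b_committed_data preserves the
+ *	   committed on-disk image until the old transaction commits...)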
+ * + * We never need to know which transaction the committed data is part + * of, buffers touched here are guaranteed to be dirtied later and so + * will be committed to a new transaction in due course, at which point + * we can discard the old committed data pointer. + * + * Returns error number or 0 on success. + */ +int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) +{ + int err; + struct journal_head *jh; + char *committed_data = NULL; + + if (is_handle_aborted(handle)) + return -EROFS; + + if (jbd2_write_access_granted(handle, bh, true)) + return 0; + + jh = jbd2_journal_add_journal_head(bh); + JBUFFER_TRACE(jh, "entry"); + + /* + * Do this first --- it can drop the journal lock, so we want to + * make sure that obtaining the committed_data is done + * atomically wrt. completion of any outstanding commits. + */ + err = do_get_write_access(handle, jh, 1); + if (err) + goto out; + +repeat: + if (!jh->b_committed_data) + committed_data = jbd2_alloc(jh2bh(jh)->b_size, + GFP_NOFS|__GFP_NOFAIL); + + spin_lock(&jh->b_state_lock); + if (!jh->b_committed_data) { + /* Copy out the current buffer contents into the + * preserved, committed copy. */ + JBUFFER_TRACE(jh, "generate b_committed data"); + if (!committed_data) { + spin_unlock(&jh->b_state_lock); + goto repeat; + } + + jh->b_committed_data = committed_data; + committed_data = NULL; + memcpy(jh->b_committed_data, bh->b_data, bh->b_size); + } + spin_unlock(&jh->b_state_lock); +out: + jbd2_journal_put_journal_head(jh); + if (unlikely(committed_data)) + jbd2_free(committed_data, bh->b_size); + return err; +} + +/** + * jbd2_journal_set_triggers() - Add triggers for commit writeout + * @bh: buffer to trigger on + * @type: struct jbd2_buffer_trigger_type containing the trigger(s). + * + * Set any triggers on this journal_head. This is always safe, because + * triggers for a committing buffer will be saved off, and triggers for + * a running transaction will match the buffer in that transaction. + * + * Call with NULL to clear the triggers. + */ +void jbd2_journal_set_triggers(struct buffer_head *bh, + struct jbd2_buffer_trigger_type *type) +{ + struct journal_head *jh = jbd2_journal_grab_journal_head(bh); + + if (WARN_ON_ONCE(!jh)) + return; + jh->b_triggers = type; + jbd2_journal_put_journal_head(jh); +} + +void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, + struct jbd2_buffer_trigger_type *triggers) +{ + struct buffer_head *bh = jh2bh(jh); + + if (!triggers || !triggers->t_frozen) + return; + + triggers->t_frozen(triggers, bh, mapped_data, bh->b_size); +} + +void jbd2_buffer_abort_trigger(struct journal_head *jh, + struct jbd2_buffer_trigger_type *triggers) +{ + if (!triggers || !triggers->t_abort) + return; + + triggers->t_abort(triggers, jh2bh(jh)); +} + +/** + * jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata + * @handle: transaction to add buffer to. + * @bh: buffer to mark + * + * mark dirty metadata which needs to be journaled as part of the current + * transaction. + * + * The buffer must have previously had jbd2_journal_get_write_access() + * called so that it has a valid journal_head attached to the buffer + * head. + * + * The buffer is placed on the transaction's metadata list and is marked + * as belonging to the transaction. + * + * Returns error number or 0 on success. + * + * Special care needs to be taken if the buffer already belongs to the + * current committing transaction (in which case we should have frozen + * data present for that commit). 
In that case, we don't relink the + * buffer: that only gets done when the old transaction finally + * completes its commit. + */ +int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal; + struct journal_head *jh; + int ret = 0; + + if (!buffer_jbd(bh)) + return -EUCLEAN; + + /* + * We don't grab jh reference here since the buffer must be part + * of the running transaction. + */ + jh = bh2jh(bh); + jbd2_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); + + /* + * This and the following assertions are unreliable since we may see jh + * in inconsistent state unless we grab bh_state lock. But this is + * crucial to catch bugs so let's do a reliable check until the + * lockless handling is fully proven. + */ + if (data_race(jh->b_transaction != transaction && + jh->b_next_transaction != transaction)) { + spin_lock(&jh->b_state_lock); + J_ASSERT_JH(jh, jh->b_transaction == transaction || + jh->b_next_transaction == transaction); + spin_unlock(&jh->b_state_lock); + } + if (data_race(jh->b_modified == 1)) { + /* If it's in our transaction it must be in BJ_Metadata list. */ + if (data_race(jh->b_transaction == transaction && + jh->b_jlist != BJ_Metadata)) { + spin_lock(&jh->b_state_lock); + if (jh->b_transaction == transaction && + jh->b_jlist != BJ_Metadata) + pr_err("JBD2: assertion failure: h_type=%u " + "h_line_no=%u block_no=%llu jlist=%u\n", + handle->h_type, handle->h_line_no, + (unsigned long long) bh->b_blocknr, + jh->b_jlist); + J_ASSERT_JH(jh, jh->b_transaction != transaction || + jh->b_jlist == BJ_Metadata); + spin_unlock(&jh->b_state_lock); + } + goto out; + } + + spin_lock(&jh->b_state_lock); + + if (is_handle_aborted(handle)) { + /* + * Check journal aborting with @jh->b_state_lock locked, + * since 'jh->b_transaction' could be replaced with + * 'jh->b_next_transaction' during old transaction + * committing if journal aborted, which may fail + * assertion on 'jh->b_frozen_data == NULL'. + */ + ret = -EROFS; + goto out_unlock_bh; + } + + journal = transaction->t_journal; + + if (jh->b_modified == 0) { + /* + * This buffer's got modified and becoming part + * of the transaction. This needs to be done + * once a transaction -bzzz + */ + if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= 0)) { + ret = -ENOSPC; + goto out_unlock_bh; + } + jh->b_modified = 1; + handle->h_total_credits--; + } + + /* + * fastpath, to avoid expensive locking. If this buffer is already + * on the running transaction's metadata list there is nothing to do. + * Nobody can take it off again because there is a handle open. + * I _think_ we're OK here with SMP barriers - a mistaken decision will + * result in this test being false, so we go in and take the locks. + */ + if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { + JBUFFER_TRACE(jh, "fastpath"); + if (unlikely(jh->b_transaction != + journal->j_running_transaction)) { + printk(KERN_ERR "JBD2: %s: " + "jh->b_transaction (%llu, %p, %u) != " + "journal->j_running_transaction (%p, %u)\n", + journal->j_devname, + (unsigned long long) bh->b_blocknr, + jh->b_transaction, + jh->b_transaction ? jh->b_transaction->t_tid : 0, + journal->j_running_transaction, + journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); + ret = -EINVAL; + } + goto out_unlock_bh; + } + + set_buffer_jbddirty(bh); + + /* + * Metadata already on the current transaction list doesn't + * need to be filed. 
Metadata on another transaction's list must + * be committing, and will be refiled once the commit completes: + * leave it alone for now. + */ + if (jh->b_transaction != transaction) { + JBUFFER_TRACE(jh, "already on other transaction"); + if (unlikely(((jh->b_transaction != + journal->j_committing_transaction)) || + (jh->b_next_transaction != transaction))) { + printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: " + "bad jh for block %llu: " + "transaction (%p, %u), " + "jh->b_transaction (%p, %u), " + "jh->b_next_transaction (%p, %u), jlist %u\n", + journal->j_devname, + (unsigned long long) bh->b_blocknr, + transaction, transaction->t_tid, + jh->b_transaction, + jh->b_transaction ? + jh->b_transaction->t_tid : 0, + jh->b_next_transaction, + jh->b_next_transaction ? + jh->b_next_transaction->t_tid : 0, + jh->b_jlist); + WARN_ON(1); + ret = -EINVAL; + } + /* And this case is illegal: we can't reuse another + * transaction's data buffer, ever. */ + goto out_unlock_bh; + } + + /* That test should have eliminated the following case: */ + J_ASSERT_JH(jh, jh->b_frozen_data == NULL); + + JBUFFER_TRACE(jh, "file as BJ_Metadata"); + spin_lock(&journal->j_list_lock); + __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata); + spin_unlock(&journal->j_list_lock); +out_unlock_bh: + spin_unlock(&jh->b_state_lock); +out: + JBUFFER_TRACE(jh, "exit"); + return ret; +} + +/** + * jbd2_journal_forget() - bforget() for potentially-journaled buffers. + * @handle: transaction handle + * @bh: bh to 'forget' + * + * We can only do the bforget if there are no commits pending against the + * buffer. If the buffer is dirty in the current running transaction we + * can safely unlink it. + * + * bh may not be a journalled buffer at all - it may be a non-JBD + * buffer which came off the hashtable. Check for this. + * + * Decrements bh->b_count by one. + * + * Allow this call even if the handle has aborted --- it may be part of + * the caller's cleanup after an abort. + */ +int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal; + struct journal_head *jh; + int drop_reserve = 0; + int err = 0; + int was_modified = 0; + int wait_for_writeback = 0; + + if (is_handle_aborted(handle)) + return -EROFS; + journal = transaction->t_journal; + + BUFFER_TRACE(bh, "entry"); + + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) { + __bforget(bh); + return 0; + } + + spin_lock(&jh->b_state_lock); + + /* Critical error: attempting to delete a bitmap buffer, maybe? + * Don't do any jbd operations, and return an error. */ + if (!J_EXPECT_JH(jh, !jh->b_committed_data, + "inconsistent data on disk")) { + err = -EIO; + goto drop; + } + + /* keep track of whether or not this transaction modified us */ + was_modified = jh->b_modified; + + /* + * The buffer's going from the transaction, we must drop + * all references -bzzz + */ + jh->b_modified = 0; + + if (jh->b_transaction == transaction) { + J_ASSERT_JH(jh, !jh->b_frozen_data); + + /* If we are forgetting a buffer which is already part + * of this transaction, then we can just drop it from + * the transaction immediately. */ + clear_buffer_dirty(bh); + clear_buffer_jbddirty(bh); + + JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); + + /* + * we only want to drop a reference if this transaction + * modified the buffer + */ + if (was_modified) + drop_reserve = 1; + + /* + * We are no longer going to journal this buffer. 
+		 *
+		 * However, the commit of this transaction is still
+		 * important to the buffer: the delete that we are now
+		 * processing might obsolete an old log entry, so by
+		 * committing, we can satisfy the buffer's checkpoint.
+		 *
+		 * So, if we have a checkpoint on the buffer, we should
+		 * now refile the buffer on our BJ_Forget list so that
+		 * we know to remove the checkpoint after we commit.
+		 */
+
+		spin_lock(&journal->j_list_lock);
+		if (jh->b_cp_transaction) {
+			__jbd2_journal_temp_unlink_buffer(jh);
+			__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
+		} else {
+			__jbd2_journal_unfile_buffer(jh);
+			jbd2_journal_put_journal_head(jh);
+		}
+		spin_unlock(&journal->j_list_lock);
+	} else if (jh->b_transaction) {
+		J_ASSERT_JH(jh, (jh->b_transaction ==
+				 journal->j_committing_transaction));
+		/* However, if the buffer is still owned by a prior
+		 * (committing) transaction, we can't drop it yet... */
+		JBUFFER_TRACE(jh, "belongs to older transaction");
+		/* ... but we CAN drop it from the new transaction through
+		 * marking the buffer as freed and set j_next_transaction to
+		 * the new transaction, so that not only the commit code
+		 * knows it should clear dirty bits when it is done with the
+		 * buffer, but also the buffer can be checkpointed only
+		 * after the new transaction commits. */
+
+		set_buffer_freed(bh);
+
+		if (!jh->b_next_transaction) {
+			spin_lock(&journal->j_list_lock);
+			jh->b_next_transaction = transaction;
+			spin_unlock(&journal->j_list_lock);
+		} else {
+			J_ASSERT(jh->b_next_transaction == transaction);
+
+			/*
+			 * only drop a reference if this transaction modified
+			 * the buffer
+			 */
+			if (was_modified)
+				drop_reserve = 1;
+		}
+	} else {
+		/*
+		 * Finally, if the buffer does not belong to any
+		 * transaction, we can just drop it now if it has no
+		 * checkpoint.
+		 */
+		spin_lock(&journal->j_list_lock);
+		if (!jh->b_cp_transaction) {
+			JBUFFER_TRACE(jh, "belongs to none transaction");
+			spin_unlock(&journal->j_list_lock);
+			goto drop;
+		}
+
+		/*
+		 * Otherwise, if the buffer has been written to disk,
+		 * it is safe to remove the checkpoint and drop it.
+		 */
+		if (jbd2_journal_try_remove_checkpoint(jh) >= 0) {
+			spin_unlock(&journal->j_list_lock);
+			goto drop;
+		}
+
+		/*
+		 * The buffer has not yet been written to disk. We should
+		 * either clear the buffer or ensure that the ongoing I/O
+		 * is completed, and attach this buffer to current
+		 * transaction so that the buffer can be checkpointed only
+		 * after the current transaction commits.
+		 */
+		clear_buffer_dirty(bh);
+		wait_for_writeback = 1;
+		__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
+		spin_unlock(&journal->j_list_lock);
+	}
+drop:
+	__brelse(bh);
+	spin_unlock(&jh->b_state_lock);
+	if (wait_for_writeback)
+		wait_on_buffer(bh);
+	jbd2_journal_put_journal_head(jh);
+	if (drop_reserve) {
+		/* no need to reserve log space for this block -bzzz */
+		handle->h_total_credits++;
+	}
+	return err;
+}
+
+/**
+ * jbd2_journal_stop() - complete a transaction
+ * @handle: transaction to complete.
+ *
+ * All done for a particular handle.
+ *
+ * There is not much action needed here. We just return any remaining
+ * buffer credits to the transaction and remove the handle. The only
+ * complication is that we need to start a commit operation if the
+ * filesystem is marked for synchronous update.
+ *
+ * jbd2_journal_stop itself will not usually return an error, but it may
+ * do so in unusual circumstances. In particular, expect it to
+ * return -EIO if a jbd2_journal_abort has been executed since the
+ * transaction began.
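+ *
+ * Pairing sketch (editor's illustration of a hypothetical caller):
+ *
+ *	handle = jbd2_journal_start(journal, credits);
+ *	if (IS_ERR(handle))
+ *		return PTR_ERR(handle);
+ *	...modify buffers under the handle...
+ *	err = jbd2_journal_stop(handle);	(may report an earlier abort)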
+ */ +int jbd2_journal_stop(handle_t *handle) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal; + int err = 0, wait_for_commit = 0; + tid_t tid; + pid_t pid; + + if (--handle->h_ref > 0) { + jbd2_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, + handle->h_ref); + if (is_handle_aborted(handle)) + return -EIO; + return 0; + } + if (!transaction) { + /* + * Handle is already detached from the transaction so there is + * nothing to do other than free the handle. + */ + memalloc_nofs_restore(handle->saved_alloc_context); + goto free_and_exit; + } + journal = transaction->t_journal; + tid = transaction->t_tid; + + if (is_handle_aborted(handle)) + err = -EIO; + + jbd2_debug(4, "Handle %p going down\n", handle); + trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, + tid, handle->h_type, handle->h_line_no, + jiffies - handle->h_start_jiffies, + handle->h_sync, handle->h_requested_credits, + (handle->h_requested_credits - + handle->h_total_credits)); + + /* + * Implement synchronous transaction batching. If the handle + * was synchronous, don't force a commit immediately. Let's + * yield and let another thread piggyback onto this + * transaction. Keep doing that while new threads continue to + * arrive. It doesn't cost much - we're about to run a commit + * and sleep on IO anyway. Speeds up many-threaded, many-dir + * operations by 30x or more... + * + * We try and optimize the sleep time against what the + * underlying disk can do, instead of having a static sleep + * time. This is useful for the case where our storage is so + * fast that it is more optimal to go ahead and force a flush + * and wait for the transaction to be committed than it is to + * wait for an arbitrary amount of time for new writers to + * join the transaction. We achieve this by measuring how + * long it takes to commit a transaction, and compare it with + * how long this transaction has been running, and if run time + * < commit time then we sleep for the delta and commit. This + * greatly helps super fast disks that would see slowdowns as + * more threads started doing fsyncs. + * + * But don't do this if this process was the most recent one + * to perform a synchronous write. We do this to detect the + * case where a single process is doing a stream of sync + * writes. No point in waiting for joiners in that case. + * + * Setting max_batch_time to 0 disables this completely. + */ + pid = current->pid; + if (handle->h_sync && journal->j_last_sync_writer != pid && + journal->j_max_batch_time) { + u64 commit_time, trans_time; + + journal->j_last_sync_writer = pid; + + read_lock(&journal->j_state_lock); + commit_time = journal->j_average_commit_time; + read_unlock(&journal->j_state_lock); + + trans_time = ktime_to_ns(ktime_sub(ktime_get(), + transaction->t_start_time)); + + commit_time = max_t(u64, commit_time, + 1000*journal->j_min_batch_time); + commit_time = min_t(u64, commit_time, + 1000*journal->j_max_batch_time); + + if (trans_time < commit_time) { + ktime_t expires = ktime_add_ns(ktime_get(), + commit_time); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + } + } + + if (handle->h_sync) + transaction->t_synchronous_commit = 1; + + /* + * If the handle is marked SYNC, we need to set another commit + * going! We also want to force a commit if the transaction is too + * old now. 
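+	 *
+	 * (Editor's note sketching the fsync path: a caller that set
+	 * handle->h_sync = 1 reaches this point, the commit is started
+	 * below, and jbd2_log_wait_commit() then blocks until tid is
+	 * durable on disk.)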
+ */ + if (handle->h_sync || + time_after_eq(jiffies, transaction->t_expires)) { + /* Do this even for aborted journals: an abort still + * completes the commit thread, it just doesn't write + * anything to disk. */ + + jbd2_debug(2, "transaction too old, requesting commit for " + "handle %p\n", handle); + /* This is non-blocking */ + jbd2_log_start_commit(journal, tid); + + /* + * Special case: JBD2_SYNC synchronous updates require us + * to wait for the commit to complete. + */ + if (handle->h_sync && !(current->flags & PF_MEMALLOC)) + wait_for_commit = 1; + } + + /* + * Once stop_this_handle() drops t_updates, the transaction could start + * committing on us and eventually disappear. So we must not + * dereference transaction pointer again after calling + * stop_this_handle(). + */ + stop_this_handle(handle); + + if (wait_for_commit) + err = jbd2_log_wait_commit(journal, tid); + +free_and_exit: + if (handle->h_rsv_handle) + jbd2_free_handle(handle->h_rsv_handle); + jbd2_free_handle(handle); + return err; +} + +/* + * + * List management code snippets: various functions for manipulating the + * transaction buffer lists. + * + */ + +/* + * Append a buffer to a transaction list, given the transaction's list head + * pointer. + * + * j_list_lock is held. + * + * jh->b_state_lock is held. + */ + +static inline void +__blist_add_buffer(struct journal_head **list, struct journal_head *jh) +{ + if (!*list) { + jh->b_tnext = jh->b_tprev = jh; + *list = jh; + } else { + /* Insert at the tail of the list to preserve order */ + struct journal_head *first = *list, *last = first->b_tprev; + jh->b_tprev = last; + jh->b_tnext = first; + last->b_tnext = first->b_tprev = jh; + } +} + +/* + * Remove a buffer from a transaction list, given the transaction's list + * head pointer. + * + * Called with j_list_lock held, and the journal may not be locked. + * + * jh->b_state_lock is held. + */ + +static inline void +__blist_del_buffer(struct journal_head **list, struct journal_head *jh) +{ + if (*list == jh) { + *list = jh->b_tnext; + if (*list == jh) + *list = NULL; + } + jh->b_tprev->b_tnext = jh->b_tnext; + jh->b_tnext->b_tprev = jh->b_tprev; +} + +/* + * Remove a buffer from the appropriate transaction list. + * + * Note that this function can *change* the value of + * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or + * t_reserved_list. If the caller is holding onto a copy of one of these + * pointers, it could go bad. Generally the caller needs to re-read the + * pointer from the transaction_t. + * + * Called under j_list_lock. 
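+ *
+ * e.g. (editor's sketch) a caller holding a cached list head must
+ * re-read it afterwards:
+ *
+ *	__jbd2_journal_temp_unlink_buffer(jh);
+ *	jh = transaction->t_buffers;	(pointer may have changed)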
+ */ +static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) +{ + struct journal_head **list = NULL; + transaction_t *transaction; + struct buffer_head *bh = jh2bh(jh); + + lockdep_assert_held(&jh->b_state_lock); + transaction = jh->b_transaction; + if (transaction) + assert_spin_locked(&transaction->t_journal->j_list_lock); + + J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); + if (jh->b_jlist != BJ_None) + J_ASSERT_JH(jh, transaction != NULL); + + switch (jh->b_jlist) { + case BJ_None: + return; + case BJ_Metadata: + transaction->t_nr_buffers--; + J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); + list = &transaction->t_buffers; + break; + case BJ_Forget: + list = &transaction->t_forget; + break; + case BJ_Shadow: + list = &transaction->t_shadow_list; + break; + case BJ_Reserved: + list = &transaction->t_reserved_list; + break; + } + + __blist_del_buffer(list, jh); + jh->b_jlist = BJ_None; + if (transaction && is_journal_aborted(transaction->t_journal)) + clear_buffer_jbddirty(bh); + else if (test_clear_buffer_jbddirty(bh)) + mark_buffer_dirty(bh); /* Expose it to the VM */ +} + +/* + * Remove buffer from all transactions. The caller is responsible for dropping + * the jh reference that belonged to the transaction. + * + * Called with bh_state lock and j_list_lock + */ +static void __jbd2_journal_unfile_buffer(struct journal_head *jh) +{ + J_ASSERT_JH(jh, jh->b_transaction != NULL); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + + __jbd2_journal_temp_unlink_buffer(jh); + jh->b_transaction = NULL; +} + +/** + * jbd2_journal_try_to_free_buffers() - try to free page buffers. + * @journal: journal for operation + * @folio: Folio to detach data from. + * + * For all the buffers on this page, + * if they are fully written out ordered data, move them onto BUF_CLEAN + * so try_to_free_buffers() can reap them. + * + * This function returns non-zero if we wish try_to_free_buffers() + * to be called. We do this if the page is releasable by try_to_free_buffers(). + * We also do it if the page has locked or dirty buffers and the caller wants + * us to perform sync or async writeout. + * + * This complicates JBD locking somewhat. We aren't protected by the + * BKL here. We wish to remove the buffer from its committing or + * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer. + * + * This may *change* the value of transaction_t->t_datalist, so anyone + * who looks at t_datalist needs to lock against this function. + * + * Even worse, someone may be doing a jbd2_journal_dirty_data on this + * buffer. So we need to lock against that. jbd2_journal_dirty_data() + * will come out of the lock with the buffer dirty, which makes it + * ineligible for release here. + * + * Who else is affected by this? hmm... Really the only contender + * is do_get_write_access() - it could be looking at the buffer while + * journal_try_to_free_buffer() is changing its state. But that + * cannot happen because we never reallocate freed data as metadata + * while the data is part of a transaction. Yes? + * + * Return false on failure, true on success + */ +bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio) +{ + struct buffer_head *head; + struct buffer_head *bh; + bool ret = false; + + J_ASSERT(folio_test_locked(folio)); + + head = folio_buffers(folio); + bh = head; + do { + struct journal_head *jh; + + /* + * We take our own ref against the journal_head here to avoid + * having to add tons of locking around each instance of + * jbd2_journal_put_journal_head(). 
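+		 *
+		 * (Illustrative note: the grab returns NULL when bh has
+		 * no journal_head at all, so such buffers are simply
+		 * skipped, and every successful grab is paired with the
+		 * put below before the loop advances.)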
+ */ + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) + continue; + + spin_lock(&jh->b_state_lock); + if (!jh->b_transaction && !jh->b_next_transaction) { + spin_lock(&journal->j_list_lock); + /* Remove written-back checkpointed metadata buffer */ + if (jh->b_cp_transaction != NULL) + jbd2_journal_try_remove_checkpoint(jh); + spin_unlock(&journal->j_list_lock); + } + spin_unlock(&jh->b_state_lock); + jbd2_journal_put_journal_head(jh); + if (buffer_jbd(bh)) + goto busy; + } while ((bh = bh->b_this_page) != head); + + ret = try_to_free_buffers(folio); +busy: + return ret; +} + +/* + * This buffer is no longer needed. If it is on an older transaction's + * checkpoint list we need to record it on this transaction's forget list + * to pin this buffer (and hence its checkpointing transaction) down until + * this transaction commits. If the buffer isn't on a checkpoint list, we + * release it. + * Returns non-zero if JBD no longer has an interest in the buffer. + * + * Called under j_list_lock. + * + * Called under jh->b_state_lock. + */ +static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) +{ + int may_free = 1; + struct buffer_head *bh = jh2bh(jh); + + if (jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "on running+cp transaction"); + __jbd2_journal_temp_unlink_buffer(jh); + /* + * We don't want to write the buffer anymore, clear the + * bit so that we don't confuse checks in + * __jbd2_journal_file_buffer + */ + clear_buffer_dirty(bh); + __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); + may_free = 0; + } else { + JBUFFER_TRACE(jh, "on running transaction"); + __jbd2_journal_unfile_buffer(jh); + jbd2_journal_put_journal_head(jh); + } + return may_free; +} + +/* + * jbd2_journal_invalidate_folio + * + * This code is tricky. It has a number of cases to deal with. + * + * There are two invariants which this code relies on: + * + * i_size must be updated on disk before we start calling invalidate_folio + * on the data. + * + * This is done in ext3 by defining an ext3_setattr method which + * updates i_size before truncate gets going. By maintaining this + * invariant, we can be sure that it is safe to throw away any buffers + * attached to the current transaction: once the transaction commits, + * we know that the data will not be needed. + * + * Note however that we can *not* throw away data belonging to the + * previous, committing transaction! + * + * Any disk blocks which *are* part of the previous, committing + * transaction (and which therefore cannot be discarded immediately) are + * not going to be reused in the new running transaction + * + * The bitmap committed_data images guarantee this: any block which is + * allocated in one transaction and removed in the next will be marked + * as in-use in the committed_data bitmap, so cannot be reused until + * the next transaction to delete the block commits. This means that + * leaving committing buffers dirty is quite safe: the disk blocks + * cannot be reallocated to a different file and so buffer aliasing is + * not possible. + * + * + * The above applies mainly to ordered data mode. In writeback mode we + * don't make guarantees about the order in which data hits disk --- in + * particular we don't guarantee that new dirty data is flushed before + * transaction commit --- so it is always safe just to discard data + * immediately in that mode. 
--sct + */ + +/* + * The journal_unmap_buffer helper function returns zero if the buffer + * concerned remains pinned as an anonymous buffer belonging to an older + * transaction. + * + * We're outside-transaction here. Either or both of j_running_transaction + * and j_committing_transaction may be NULL. + */ +static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, + int partial_page) +{ + transaction_t *transaction; + struct journal_head *jh; + int may_free = 1; + + BUFFER_TRACE(bh, "entry"); + + /* + * It is safe to proceed here without the j_list_lock because the + * buffers cannot be stolen by try_to_free_buffers as long as we are + * holding the page lock. --sct + */ + + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) + goto zap_buffer_unlocked; + + /* OK, we have data buffer in journaled mode */ + write_lock(&journal->j_state_lock); + spin_lock(&jh->b_state_lock); + spin_lock(&journal->j_list_lock); + + /* + * We cannot remove the buffer from checkpoint lists until the + * transaction adding inode to orphan list (let's call it T) + * is committed. Otherwise if the transaction changing the + * buffer would be cleaned from the journal before T is + * committed, a crash will cause that the correct contents of + * the buffer will be lost. On the other hand we have to + * clear the buffer dirty bit at latest at the moment when the + * transaction marking the buffer as freed in the filesystem + * structures is committed because from that moment on the + * block can be reallocated and used by a different page. + * Since the block hasn't been freed yet but the inode has + * already been added to orphan list, it is safe for us to add + * the buffer to BJ_Forget list of the newest transaction. + * + * Also we have to clear buffer_mapped flag of a truncated buffer + * because the buffer_head may be attached to the page straddling + * i_size (can happen only when blocksize < pagesize) and thus the + * buffer_head can be reused when the file is extended again. So we end + * up keeping around invalidated buffers attached to transactions' + * BJ_Forget list just to stop checkpointing code from cleaning up + * the transaction this buffer was modified in. + */ + transaction = jh->b_transaction; + if (transaction == NULL) { + /* First case: not on any transaction. If it + * has no checkpoint link, then we can zap it: + * it's a writeback-mode buffer so we don't care + * if it hits disk safely. */ + if (!jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "not on any transaction: zap"); + goto zap_buffer; + } + + if (!buffer_dirty(bh)) { + /* bdflush has written it. We can drop it now */ + __jbd2_journal_remove_checkpoint(jh); + goto zap_buffer; + } + + /* OK, it must be in the journal but still not + * written fully to disk: it's metadata or + * journaled data... */ + + if (journal->j_running_transaction) { + /* ... and once the current transaction has + * committed, the buffer won't be needed any + * longer. */ + JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); + may_free = __dispose_buffer(jh, + journal->j_running_transaction); + goto zap_buffer; + } else { + /* There is no currently-running transaction. So the + * orphan record which we wrote for this file must have + * passed into commit. We must attach this buffer to + * the committing transaction, if it exists. 
*/ + if (journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "give to committing trans"); + may_free = __dispose_buffer(jh, + journal->j_committing_transaction); + goto zap_buffer; + } else { + /* The orphan record's transaction has + * committed. We can cleanse this buffer */ + clear_buffer_jbddirty(bh); + __jbd2_journal_remove_checkpoint(jh); + goto zap_buffer; + } + } + } else if (transaction == journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "on committing transaction"); + /* + * The buffer is committing, we simply cannot touch + * it. If the page is straddling i_size we have to wait + * for commit and try again. + */ + if (partial_page) { + spin_unlock(&journal->j_list_lock); + spin_unlock(&jh->b_state_lock); + write_unlock(&journal->j_state_lock); + jbd2_journal_put_journal_head(jh); + /* Already zapped buffer? Nothing to do... */ + if (!bh->b_bdev) + return 0; + return -EBUSY; + } + /* + * OK, buffer won't be reachable after truncate. We just clear + * b_modified to not confuse transaction credit accounting, and + * set j_next_transaction to the running transaction (if there + * is one) and mark buffer as freed so that commit code knows + * it should clear dirty bits when it is done with the buffer. + */ + set_buffer_freed(bh); + if (journal->j_running_transaction && buffer_jbddirty(bh)) + jh->b_next_transaction = journal->j_running_transaction; + jh->b_modified = 0; + spin_unlock(&journal->j_list_lock); + spin_unlock(&jh->b_state_lock); + write_unlock(&journal->j_state_lock); + jbd2_journal_put_journal_head(jh); + return 0; + } else { + /* Good, the buffer belongs to the running transaction. + * We are writing our own transaction's data, not any + * previous one's, so it is safe to throw it away + * (remember that we expect the filesystem to have set + * i_size already for this truncate so recovery will not + * expose the disk blocks we are discarding here.) */ + J_ASSERT_JH(jh, transaction == journal->j_running_transaction); + JBUFFER_TRACE(jh, "on running transaction"); + may_free = __dispose_buffer(jh, transaction); + } + +zap_buffer: + /* + * This is tricky. Although the buffer is truncated, it may be reused + * if blocksize < pagesize and it is attached to the page straddling + * EOF. Since the buffer might have been added to BJ_Forget list of the + * running transaction, journal_get_write_access() won't clear + * b_modified and credit accounting gets confused. So clear b_modified + * here. + */ + jh->b_modified = 0; + spin_unlock(&journal->j_list_lock); + spin_unlock(&jh->b_state_lock); + write_unlock(&journal->j_state_lock); + jbd2_journal_put_journal_head(jh); +zap_buffer_unlocked: + clear_buffer_dirty(bh); + J_ASSERT_BH(bh, !buffer_jbddirty(bh)); + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + clear_buffer_delay(bh); + clear_buffer_unwritten(bh); + bh->b_bdev = NULL; + return may_free; +} + +/** + * jbd2_journal_invalidate_folio() + * @journal: journal to use for flush... + * @folio: folio to flush + * @offset: start of the range to invalidate + * @length: length of the range to invalidate + * + * Reap page buffers containing data after in the specified range in page. + * Can return -EBUSY if buffers are part of the committing transaction and + * the page is straddling i_size. Caller then has to wait for current commit + * and try again. 
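+ *
+ * (Editor's sketch of the retry a caller is expected to perform on
+ * -EBUSY, loosely modelled on ext4; 'tid' would be the committing
+ * transaction's id, obtained separately:
+ *
+ *	while (jbd2_journal_invalidate_folio(journal, folio, 0,
+ *					     folio_size(folio)) == -EBUSY)
+ *		jbd2_log_wait_commit(journal, tid);
+ * )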
+ */ +int jbd2_journal_invalidate_folio(journal_t *journal, struct folio *folio, + size_t offset, size_t length) +{ + struct buffer_head *head, *bh, *next; + unsigned int stop = offset + length; + unsigned int curr_off = 0; + int partial_page = (offset || length < folio_size(folio)); + int may_free = 1; + int ret = 0; + + if (!folio_test_locked(folio)) + BUG(); + head = folio_buffers(folio); + if (!head) + return 0; + + BUG_ON(stop > folio_size(folio) || stop < length); + + /* We will potentially be playing with lists other than just the + * data lists (especially for journaled data mode), so be + * cautious in our locking. */ + + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + if (next_off > stop) + return 0; + + if (offset <= curr_off) { + /* This block is wholly outside the truncation point */ + lock_buffer(bh); + ret = journal_unmap_buffer(journal, bh, partial_page); + unlock_buffer(bh); + if (ret < 0) + return ret; + may_free &= ret; + } + curr_off = next_off; + bh = next; + + } while (bh != head); + + if (!partial_page) { + if (may_free && try_to_free_buffers(folio)) + J_ASSERT(!folio_buffers(folio)); + } + return 0; +} + +/* + * File a buffer on the given transaction list. + */ +void __jbd2_journal_file_buffer(struct journal_head *jh, + transaction_t *transaction, int jlist) +{ + struct journal_head **list = NULL; + int was_dirty = 0; + struct buffer_head *bh = jh2bh(jh); + + lockdep_assert_held(&jh->b_state_lock); + assert_spin_locked(&transaction->t_journal->j_list_lock); + + J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); + J_ASSERT_JH(jh, jh->b_transaction == transaction || + jh->b_transaction == NULL); + + if (jh->b_transaction && jh->b_jlist == jlist) + return; + + if (jlist == BJ_Metadata || jlist == BJ_Reserved || + jlist == BJ_Shadow || jlist == BJ_Forget) { + /* + * For metadata buffers, we track dirty bit in buffer_jbddirty + * instead of buffer_dirty. We should not see a dirty bit set + * here because we clear it in do_get_write_access but e.g. + * tune2fs can modify the sb and set the dirty bit at any time + * so we try to gracefully handle that. + */ + if (buffer_dirty(bh)) + warn_dirty_buffer(bh); + if (test_clear_buffer_dirty(bh) || + test_clear_buffer_jbddirty(bh)) + was_dirty = 1; + } + + if (jh->b_transaction) + __jbd2_journal_temp_unlink_buffer(jh); + else + jbd2_journal_grab_journal_head(bh); + jh->b_transaction = transaction; + + switch (jlist) { + case BJ_None: + J_ASSERT_JH(jh, !jh->b_committed_data); + J_ASSERT_JH(jh, !jh->b_frozen_data); + return; + case BJ_Metadata: + transaction->t_nr_buffers++; + list = &transaction->t_buffers; + break; + case BJ_Forget: + list = &transaction->t_forget; + break; + case BJ_Shadow: + list = &transaction->t_shadow_list; + break; + case BJ_Reserved: + list = &transaction->t_reserved_list; + break; + } + + __blist_add_buffer(list, jh); + jh->b_jlist = jlist; + + if (was_dirty) + set_buffer_jbddirty(bh); +} + +void jbd2_journal_file_buffer(struct journal_head *jh, + transaction_t *transaction, int jlist) +{ + spin_lock(&jh->b_state_lock); + spin_lock(&transaction->t_journal->j_list_lock); + __jbd2_journal_file_buffer(jh, transaction, jlist); + spin_unlock(&transaction->t_journal->j_list_lock); + spin_unlock(&jh->b_state_lock); +} + +/* + * Remove a buffer from its current buffer list in preparation for + * dropping it from its current transaction entirely. 
If the buffer has + * already started to be used by a subsequent transaction, refile the + * buffer on that transaction's metadata list. + * + * Called under j_list_lock + * Called under jh->b_state_lock + * + * When this function returns true, there's no next transaction to refile to + * and the caller has to drop jh reference through + * jbd2_journal_put_journal_head(). + */ +bool __jbd2_journal_refile_buffer(struct journal_head *jh) +{ + int was_dirty, jlist; + struct buffer_head *bh = jh2bh(jh); + + lockdep_assert_held(&jh->b_state_lock); + if (jh->b_transaction) + assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); + + /* If the buffer is now unused, just drop it. */ + if (jh->b_next_transaction == NULL) { + __jbd2_journal_unfile_buffer(jh); + return true; + } + + /* + * It has been modified by a later transaction: add it to the new + * transaction's metadata list. + */ + + was_dirty = test_clear_buffer_jbddirty(bh); + __jbd2_journal_temp_unlink_buffer(jh); + + /* + * b_transaction must be set, otherwise the new b_transaction won't + * be holding jh reference + */ + J_ASSERT_JH(jh, jh->b_transaction != NULL); + + /* + * We set b_transaction here because b_next_transaction will inherit + * our jh reference and thus __jbd2_journal_file_buffer() must not + * take a new one. + */ + WRITE_ONCE(jh->b_transaction, jh->b_next_transaction); + WRITE_ONCE(jh->b_next_transaction, NULL); + if (buffer_freed(bh)) + jlist = BJ_Forget; + else if (jh->b_modified) + jlist = BJ_Metadata; + else + jlist = BJ_Reserved; + __jbd2_journal_file_buffer(jh, jh->b_transaction, jlist); + J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); + + if (was_dirty) + set_buffer_jbddirty(bh); + return false; +} + +/* + * __jbd2_journal_refile_buffer() with necessary locking added. We take our + * bh reference so that we can safely unlock bh. + * + * The jh and bh may be freed by this call. + */ +void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) +{ + bool drop; + + spin_lock(&jh->b_state_lock); + spin_lock(&journal->j_list_lock); + drop = __jbd2_journal_refile_buffer(jh); + spin_unlock(&jh->b_state_lock); + spin_unlock(&journal->j_list_lock); + if (drop) + jbd2_journal_put_journal_head(jh); +} + +/* + * File inode in the inode list of the handle's transaction + */ +static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, + unsigned long flags, loff_t start_byte, loff_t end_byte) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal; + + if (is_handle_aborted(handle)) + return -EROFS; + journal = transaction->t_journal; + + jbd2_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, + transaction->t_tid); + + spin_lock(&journal->j_list_lock); + jinode->i_flags |= flags; + + if (jinode->i_dirty_end) { + jinode->i_dirty_start = min(jinode->i_dirty_start, start_byte); + jinode->i_dirty_end = max(jinode->i_dirty_end, end_byte); + } else { + jinode->i_dirty_start = start_byte; + jinode->i_dirty_end = end_byte; + } + + /* Is inode already attached where we need it? */ + if (jinode->i_transaction == transaction || + jinode->i_next_transaction == transaction) + goto done; + + /* + * We only ever set this variable to 1 so the test is safe. 
Since + * t_need_data_flush is likely to be set, we do the test to save some + * cacheline bouncing + */ + if (!transaction->t_need_data_flush) + transaction->t_need_data_flush = 1; + /* On some different transaction's list - should be + * the committing one */ + if (jinode->i_transaction) { + J_ASSERT(jinode->i_next_transaction == NULL); + J_ASSERT(jinode->i_transaction == + journal->j_committing_transaction); + jinode->i_next_transaction = transaction; + goto done; + } + /* Not on any transaction list... */ + J_ASSERT(!jinode->i_next_transaction); + jinode->i_transaction = transaction; + list_add(&jinode->i_list, &transaction->t_inode_list); +done: + spin_unlock(&journal->j_list_lock); + + return 0; +} + +int jbd2_journal_inode_ranged_write(handle_t *handle, + struct jbd2_inode *jinode, loff_t start_byte, loff_t length) +{ + return jbd2_journal_file_inode(handle, jinode, + JI_WRITE_DATA | JI_WAIT_DATA, start_byte, + start_byte + length - 1); +} + +int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *jinode, + loff_t start_byte, loff_t length) +{ + return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA, + start_byte, start_byte + length - 1); +} + +/* + * File truncate and transaction commit interact with each other in a + * non-trivial way. If a transaction writing data block A is + * committing, we cannot discard the data by truncate until we have + * written them. Otherwise if we crashed after the transaction with + * write has committed but before the transaction with truncate has + * committed, we could see stale data in block A. This function is a + * helper to solve this problem. It starts writeout of the truncated + * part in case it is in the committing transaction. + * + * Filesystem code must call this function when inode is journaled in + * ordered mode before truncation happens and after the inode has been + * placed on orphan list with the new inode size. The second condition + * avoids the race that someone writes new data and we start + * committing the transaction after this function has been called but + * before a transaction for truncate is started (and furthermore it + * allows us to optimize the case where the addition to orphan list + * happens in the same transaction as write --- we don't have to write + * any data in such case). + */ +int jbd2_journal_begin_ordered_truncate(journal_t *journal, + struct jbd2_inode *jinode, + loff_t new_size) +{ + transaction_t *inode_trans, *commit_trans; + int ret = 0; + + /* This is a quick check to avoid locking if not necessary */ + if (!jinode->i_transaction) + goto out; + /* Locks are here just to force reading of recent values, it is + * enough that the transaction was not committing before we started + * a transaction adding the inode to orphan list */ + read_lock(&journal->j_state_lock); + commit_trans = journal->j_committing_transaction; + read_unlock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + inode_trans = jinode->i_transaction; + spin_unlock(&journal->j_list_lock); + if (inode_trans == commit_trans) { + ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, + new_size, LLONG_MAX); + if (ret) + jbd2_journal_abort(journal, ret); + } +out: + return ret; +} -- 2.43.0
From: Simon Glass <simon.glass@canonical.com> Add the JBD2 journaling layer core journal management from the Linux 6.18 kernel ext4 filesystem driver. journal.c handles: - Journal initialisation and destruction - Superblock management - Journal thread (kjournald2) operations - Block-mapping for journal storage - Feature flag management - Journal format versioning - Fast-commit support infrastructure This file provides the main journal_t structure management and is the entry point for creating and destroying journals. Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: Simon Glass <simon.glass@canonical.com> --- fs/jbd2/journal.c | 3162 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 3162 insertions(+) create mode 100644 fs/jbd2/journal.c diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c new file mode 100644 index 00000000000..d480b94117c --- /dev/null +++ b/fs/jbd2/journal.c @@ -0,0 +1,3162 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * linux/fs/jbd2/journal.c + * + * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * Generic filesystem journal-writing code; part of the ext2fs + * journaling system. + * + * This file manages journals: areas of disk reserved for logging + * transactional updates. This includes the kernel journaling thread + * which is responsible for scheduling updates to the log. + * + * We do not actually manage the physical storage of the journal in this + * file: that is left to a per-journal policy function, which allows us + * to store the journal within a filesystem-specified area for ext2 + * journaling (ext2 can use a reserved inode for storing the log). + */ + +#include <linux/module.h> +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/jbd2.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/freezer.h> +#include <linux/pagemap.h> +#include <linux/kthread.h> +#include <linux/poison.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/math64.h> +#include <linux/hash.h> +#include <linux/log2.h> +#include <linux/vmalloc.h> +#include <linux/backing-dev.h> +#include <linux/bitops.h> +#include <linux/ratelimit.h> +#include <linux/sched/mm.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/jbd2.h> + +#include <linux/uaccess.h> +#include <asm/page.h> + +#ifdef CONFIG_JBD2_DEBUG +static ushort jbd2_journal_enable_debug __read_mostly; + +module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644); +MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2"); +#endif + +EXPORT_SYMBOL(jbd2_journal_extend); +EXPORT_SYMBOL(jbd2_journal_stop); +EXPORT_SYMBOL(jbd2_journal_lock_updates); +EXPORT_SYMBOL(jbd2_journal_unlock_updates); +EXPORT_SYMBOL(jbd2_journal_get_write_access); +EXPORT_SYMBOL(jbd2_journal_get_create_access); +EXPORT_SYMBOL(jbd2_journal_get_undo_access); +EXPORT_SYMBOL(jbd2_journal_set_triggers); +EXPORT_SYMBOL(jbd2_journal_dirty_metadata); +EXPORT_SYMBOL(jbd2_journal_forget); +EXPORT_SYMBOL(jbd2_journal_flush); +EXPORT_SYMBOL(jbd2_journal_revoke); + +EXPORT_SYMBOL(jbd2_journal_init_dev); +EXPORT_SYMBOL(jbd2_journal_init_inode); +EXPORT_SYMBOL(jbd2_journal_check_used_features); +EXPORT_SYMBOL(jbd2_journal_check_available_features); +EXPORT_SYMBOL(jbd2_journal_set_features); +EXPORT_SYMBOL(jbd2_journal_load); +EXPORT_SYMBOL(jbd2_journal_destroy); +EXPORT_SYMBOL(jbd2_journal_abort); +EXPORT_SYMBOL(jbd2_journal_errno); 
+EXPORT_SYMBOL(jbd2_journal_ack_err); +EXPORT_SYMBOL(jbd2_journal_clear_err); +EXPORT_SYMBOL(jbd2_log_wait_commit); +EXPORT_SYMBOL(jbd2_journal_start_commit); +EXPORT_SYMBOL(jbd2_journal_force_commit_nested); +EXPORT_SYMBOL(jbd2_journal_wipe); +EXPORT_SYMBOL(jbd2_journal_blocks_per_folio); +EXPORT_SYMBOL(jbd2_journal_invalidate_folio); +EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); +EXPORT_SYMBOL(jbd2_journal_force_commit); +EXPORT_SYMBOL(jbd2_journal_inode_ranged_write); +EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait); +EXPORT_SYMBOL(jbd2_journal_finish_inode_data_buffers); +EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); +EXPORT_SYMBOL(jbd2_inode_cache); + +static int jbd2_journal_create_slab(size_t slab_size); + +#ifdef CONFIG_JBD2_DEBUG +void __jbd2_debug(int level, const char *file, const char *func, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (level > jbd2_journal_enable_debug) + return; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf); + va_end(args); +} +#endif + +/* Checksumming functions */ +static __be32 jbd2_superblock_csum(journal_superblock_t *sb) +{ + __u32 csum; + __be32 old_csum; + + old_csum = sb->s_checksum; + sb->s_checksum = 0; + csum = jbd2_chksum(~0, (char *)sb, sizeof(journal_superblock_t)); + sb->s_checksum = old_csum; + + return cpu_to_be32(csum); +} + +/* + * Helper function used to manage commit timeouts + */ + +static void commit_timeout(struct timer_list *t) +{ + journal_t *journal = timer_container_of(journal, t, j_commit_timer); + + wake_up_process(journal->j_task); +} + +/* + * kjournald2: The main thread function used to manage a logging device + * journal. + * + * This kernel thread is responsible for two things: + * + * 1) COMMIT: Every so often we need to commit the current state of the + * filesystem to disk. The journal thread is responsible for writing + * all of the metadata buffers to disk. If a fast commit is ongoing + * journal thread waits until it's done and then continues from + * there on. + * + * 2) CHECKPOINT: We cannot reuse a used section of the log file until all + * of the data in that part of the log has been rewritten elsewhere on + * the disk. Flushing these old buffers to reclaim space in the log is + * known as checkpointing, and this thread is responsible for that job. + */ + +static int kjournald2(void *arg) +{ + journal_t *journal = arg; + transaction_t *transaction; + + /* + * Set up an interval timer which can be used to trigger a commit wakeup + * after the commit interval expires + */ + timer_setup(&journal->j_commit_timer, commit_timeout, 0); + + set_freezable(); + + /* Record that the journal thread is running */ + journal->j_task = current; + wake_up(&journal->j_wait_done_commit); + + /* + * Make sure that no allocations from this kernel thread will ever + * recurse to the fs layer because we are responsible for the + * transaction commit and any fs involvement might get stuck waiting for + * the trasn. commit. + */ + memalloc_nofs_save(); + + /* + * And now, wait forever for commit wakeup events. 
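+	 *
+	 * (Illustrative summary: wakeups arrive from
+	 * jbd2_log_start_commit() via j_wait_commit, from the commit
+	 * timer armed when a transaction starts, and from
+	 * journal_kill_thread() setting JBD2_UNMOUNT.)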
+ */ + write_lock(&journal->j_state_lock); + +loop: + if (journal->j_flags & JBD2_UNMOUNT) + goto end_loop; + + jbd2_debug(1, "commit_sequence=%u, commit_request=%u\n", + journal->j_commit_sequence, journal->j_commit_request); + + if (journal->j_commit_sequence != journal->j_commit_request) { + jbd2_debug(1, "OK, requests differ\n"); + write_unlock(&journal->j_state_lock); + timer_delete_sync(&journal->j_commit_timer); + jbd2_journal_commit_transaction(journal); + write_lock(&journal->j_state_lock); + goto loop; + } + + wake_up(&journal->j_wait_done_commit); + if (freezing(current)) { + /* + * The simpler the better. Flushing journal isn't a + * good idea, because that depends on threads that may + * be already stopped. + */ + jbd2_debug(1, "Now suspending kjournald2\n"); + write_unlock(&journal->j_state_lock); + try_to_freeze(); + write_lock(&journal->j_state_lock); + } else { + /* + * We assume on resume that commits are already there, + * so we don't sleep + */ + DEFINE_WAIT(wait); + + prepare_to_wait(&journal->j_wait_commit, &wait, + TASK_INTERRUPTIBLE); + transaction = journal->j_running_transaction; + if (transaction == NULL || + time_before(jiffies, transaction->t_expires)) { + write_unlock(&journal->j_state_lock); + schedule(); + write_lock(&journal->j_state_lock); + } + finish_wait(&journal->j_wait_commit, &wait); + } + + jbd2_debug(1, "kjournald2 wakes\n"); + + /* + * Were we woken up by a commit wakeup event? + */ + transaction = journal->j_running_transaction; + if (transaction && time_after_eq(jiffies, transaction->t_expires)) { + journal->j_commit_request = transaction->t_tid; + jbd2_debug(1, "woke because of timeout\n"); + } + goto loop; + +end_loop: + timer_delete_sync(&journal->j_commit_timer); + journal->j_task = NULL; + wake_up(&journal->j_wait_done_commit); + jbd2_debug(1, "Journal thread exiting.\n"); + write_unlock(&journal->j_state_lock); + return 0; +} + +static int jbd2_journal_start_thread(journal_t *journal) +{ + struct task_struct *t; + + t = kthread_run(kjournald2, journal, "jbd2/%s", + journal->j_devname); + if (IS_ERR(t)) + return PTR_ERR(t); + + wait_event(journal->j_wait_done_commit, journal->j_task != NULL); + return 0; +} + +static void journal_kill_thread(journal_t *journal) +{ + write_lock(&journal->j_state_lock); + journal->j_flags |= JBD2_UNMOUNT; + + while (journal->j_task) { + write_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_commit); + wait_event(journal->j_wait_done_commit, journal->j_task == NULL); + write_lock(&journal->j_state_lock); + } + write_unlock(&journal->j_state_lock); +} + +static inline bool jbd2_data_needs_escaping(char *data) +{ + return *((__be32 *)data) == cpu_to_be32(JBD2_MAGIC_NUMBER); +} + +static inline void jbd2_data_do_escape(char *data) +{ + *((unsigned int *)data) = 0; +} + +/* + * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal. + * + * Writes a metadata buffer to a given disk block. The actual IO is not + * performed but a new buffer_head is constructed which labels the data + * to be written with the correct destination disk block. + * + * Any magic-number escaping which needs to be done will cause a + * copy-out here. If the buffer happens to start with the + * JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the + * magic number is only written to the log for descripter blocks. 
In + * this case, we copy the data and replace the first word with 0, and we + * return a result code which indicates that this buffer needs to be + * marked as an escaped buffer in the corresponding log descriptor + * block. The missing word can then be restored when the block is read + * during recovery. + * + * If the source buffer has already been modified by a new transaction + * since we took the last commit snapshot, we use the frozen copy of + * that data for IO. If we end up using the existing buffer_head's data + * for the write, then we have to make sure nobody modifies it while the + * IO is in progress. do_get_write_access() handles this. + * + * The function returns a pointer to the buffer_head to be used for IO. + * + * + * Return value: + * =0: Finished OK without escape + * =1: Finished OK with escape + */ + +int jbd2_journal_write_metadata_buffer(transaction_t *transaction, + struct journal_head *jh_in, + struct buffer_head **bh_out, + sector_t blocknr) +{ + int do_escape = 0; + struct buffer_head *new_bh; + struct folio *new_folio; + unsigned int new_offset; + struct buffer_head *bh_in = jh2bh(jh_in); + journal_t *journal = transaction->t_journal; + + /* + * The buffer really shouldn't be locked: only the current committing + * transaction is allowed to write it, so nobody else is allowed + * to do any IO. + * + * akpm: except if we're journalling data, and write() output is + * also part of a shared mapping, and another thread has + * decided to launch a writepage() against this buffer. + */ + J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); + + new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); + + /* keep subsequent assertions sane */ + atomic_set(&new_bh->b_count, 1); + + spin_lock(&jh_in->b_state_lock); + /* + * If a new transaction has already done a buffer copy-out, then + * we use that version of the data for the commit. + */ + if (jh_in->b_frozen_data) { + new_folio = virt_to_folio(jh_in->b_frozen_data); + new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data); + do_escape = jbd2_data_needs_escaping(jh_in->b_frozen_data); + if (do_escape) + jbd2_data_do_escape(jh_in->b_frozen_data); + } else { + char *tmp; + char *mapped_data; + + new_folio = bh_in->b_folio; + new_offset = offset_in_folio(new_folio, bh_in->b_data); + mapped_data = kmap_local_folio(new_folio, new_offset); + /* + * Fire data frozen trigger if data already wasn't frozen. Do + * this before checking for escaping, as the trigger may modify + * the magic offset. If a copy-out happens afterwards, it will + * have the correct data in the buffer. + */ + jbd2_buffer_frozen_trigger(jh_in, mapped_data, + jh_in->b_triggers); + do_escape = jbd2_data_needs_escaping(mapped_data); + kunmap_local(mapped_data); + /* + * Do we need to do a data copy? + */ + if (!do_escape) + goto escape_done; + + spin_unlock(&jh_in->b_state_lock); + tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL); + spin_lock(&jh_in->b_state_lock); + if (jh_in->b_frozen_data) { + jbd2_free(tmp, bh_in->b_size); + goto copy_done; + } + + jh_in->b_frozen_data = tmp; + memcpy_from_folio(tmp, new_folio, new_offset, bh_in->b_size); + /* + * This isn't strictly necessary, as we're using frozen + * data for the escaping, but it keeps consistency with + * b_frozen_data usage. 
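+		 *
+		 * (Editor's note: from here on b_frozen_data is the
+		 * copy that goes to the log for this commit, so the
+		 * escape below zeroes the frozen copy's first word
+		 * while the live buffer keeps its contents for the
+		 * running transaction.)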
+ */ + jh_in->b_frozen_triggers = jh_in->b_triggers; + +copy_done: + new_folio = virt_to_folio(jh_in->b_frozen_data); + new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data); + jbd2_data_do_escape(jh_in->b_frozen_data); + } + +escape_done: + folio_set_bh(new_bh, new_folio, new_offset); + new_bh->b_size = bh_in->b_size; + new_bh->b_bdev = journal->j_dev; + new_bh->b_blocknr = blocknr; + new_bh->b_private = bh_in; + set_buffer_mapped(new_bh); + set_buffer_dirty(new_bh); + + *bh_out = new_bh; + + /* + * The to-be-written buffer needs to get moved to the io queue, + * and the original buffer whose contents we are shadowing or + * copying is moved to the transaction's shadow queue. + */ + JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); + spin_lock(&journal->j_list_lock); + __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_unlock(&journal->j_list_lock); + set_buffer_shadow(bh_in); + spin_unlock(&jh_in->b_state_lock); + + return do_escape; +} + +/* + * Allocation code for the journal file. Manage the space left in the + * journal, so that we can begin checkpointing when appropriate. + */ + +/* + * Called with j_state_lock locked for writing. + * Returns true if a transaction commit was started. + */ +static int __jbd2_log_start_commit(journal_t *journal, tid_t target) +{ + /* Return if the txn has already requested to be committed */ + if (journal->j_commit_request == target) + return 0; + + /* + * The only transaction we can possibly wait upon is the + * currently running transaction (if it exists). Otherwise, + * the target tid must be an old one. + */ + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == target) { + /* + * We want a new commit: OK, mark the request and wakeup the + * commit thread. We do _not_ do the commit ourselves. + */ + + journal->j_commit_request = target; + jbd2_debug(1, "JBD2: requesting commit %u/%u\n", + journal->j_commit_request, + journal->j_commit_sequence); + journal->j_running_transaction->t_requested = jiffies; + wake_up(&journal->j_wait_commit); + return 1; + } else if (!tid_geq(journal->j_commit_request, target)) + /* This should never happen, but if it does, preserve + the evidence before kjournald goes into a loop and + increments j_commit_sequence beyond all recognition. */ + WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n", + journal->j_commit_request, + journal->j_commit_sequence, + target, journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); + return 0; +} + +int jbd2_log_start_commit(journal_t *journal, tid_t tid) +{ + int ret; + + write_lock(&journal->j_state_lock); + ret = __jbd2_log_start_commit(journal, tid); + write_unlock(&journal->j_state_lock); + return ret; +} + +/* + * Force and wait any uncommitted transactions. We can only force the running + * transaction if we don't have an active handle, otherwise, we will deadlock. + * Returns: <0 in case of error, + * 0 if nothing to commit, + * 1 if transaction was successfully committed. 
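+ *
+ * (Editor's note on the deadlock above: a commit waits for the
+ * transaction's t_updates count to drain, which would include any
+ * handle the caller itself still holds; that is what the
+ * current->journal_info test below guards against.)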
+ */ +static int __jbd2_journal_force_commit(journal_t *journal) +{ + transaction_t *transaction = NULL; + tid_t tid; + int need_to_start = 0, ret = 0; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction && !current->journal_info) { + transaction = journal->j_running_transaction; + if (!tid_geq(journal->j_commit_request, transaction->t_tid)) + need_to_start = 1; + } else if (journal->j_committing_transaction) + transaction = journal->j_committing_transaction; + + if (!transaction) { + /* Nothing to commit */ + read_unlock(&journal->j_state_lock); + return 0; + } + tid = transaction->t_tid; + read_unlock(&journal->j_state_lock); + if (need_to_start) + jbd2_log_start_commit(journal, tid); + ret = jbd2_log_wait_commit(journal, tid); + if (!ret) + ret = 1; + + return ret; +} + +/** + * jbd2_journal_force_commit_nested - Force and wait upon a commit if the + * calling process is not within transaction. + * + * @journal: journal to force + * Returns true if progress was made. + * + * This is used for forcing out undo-protected data which contains + * bitmaps, when the fs is running out of space. + */ +int jbd2_journal_force_commit_nested(journal_t *journal) +{ + int ret; + + ret = __jbd2_journal_force_commit(journal); + return ret > 0; +} + +/** + * jbd2_journal_force_commit() - force any uncommitted transactions + * @journal: journal to force + * + * Caller want unconditional commit. We can only force the running transaction + * if we don't have an active handle, otherwise, we will deadlock. + */ +int jbd2_journal_force_commit(journal_t *journal) +{ + int ret; + + J_ASSERT(!current->journal_info); + ret = __jbd2_journal_force_commit(journal); + if (ret > 0) + ret = 0; + return ret; +} + +/* + * Start a commit of the current running transaction (if any). Returns true + * if a transaction is going to be committed (or is currently already + * committing), and fills its tid in at *ptid + */ +int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) +{ + int ret = 0; + + write_lock(&journal->j_state_lock); + if (journal->j_running_transaction) { + tid_t tid = journal->j_running_transaction->t_tid; + + __jbd2_log_start_commit(journal, tid); + /* There's a running transaction and we've just made sure + * it's commit has been scheduled. */ + if (ptid) + *ptid = tid; + ret = 1; + } else if (journal->j_committing_transaction) { + /* + * If commit has been started, then we have to wait for + * completion of that transaction. + */ + if (ptid) + *ptid = journal->j_committing_transaction->t_tid; + ret = 1; + } + write_unlock(&journal->j_state_lock); + return ret; +} + +/* + * Return 1 if a given transaction has not yet sent barrier request + * connected with a transaction commit. If 0 is returned, transaction + * may or may not have sent the barrier. Used to avoid sending barrier + * twice in common cases. + */ +int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid) +{ + int ret = 0; + transaction_t *commit_trans, *running_trans; + + if (!(journal->j_flags & JBD2_BARRIER)) + return 0; + read_lock(&journal->j_state_lock); + /* Transaction already committed? */ + if (tid_geq(journal->j_commit_sequence, tid)) + goto out; + commit_trans = journal->j_committing_transaction; + if (!commit_trans || commit_trans->t_tid != tid) { + running_trans = journal->j_running_transaction; + /* + * The query transaction hasn't started committing, + * it must still be running. 
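+		 *
+		 * (Illustrative aside: tid checks such as the tid_geq()
+		 * above use wraparound-safe signed-difference
+		 * comparisons in the style of time_after(), so they
+		 * stay correct when the 32-bit tid counter wraps.)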
+ */ + if (WARN_ON_ONCE(!running_trans || + running_trans->t_tid != tid)) + goto out; + + running_trans->t_need_data_flush = 1; + ret = 1; + goto out; + } + /* + * Transaction is being committed and we already proceeded to + * submitting a flush to fs partition? + */ + if (journal->j_fs_dev != journal->j_dev) { + if (!commit_trans->t_need_data_flush || + commit_trans->t_state >= T_COMMIT_DFLUSH) + goto out; + } else { + if (commit_trans->t_state >= T_COMMIT_JFLUSH) + goto out; + } + ret = 1; +out: + read_unlock(&journal->j_state_lock); + return ret; +} +EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier); + +/* + * Wait for a specified commit to complete. + * The caller may not hold the journal lock. + */ +int jbd2_log_wait_commit(journal_t *journal, tid_t tid) +{ + int err = 0; + + read_lock(&journal->j_state_lock); +#ifdef CONFIG_PROVE_LOCKING + /* + * Some callers make sure transaction is already committing and in that + * case we cannot block on open handles anymore. So don't warn in that + * case. + */ + if (tid_gt(tid, journal->j_commit_sequence) && + (!journal->j_committing_transaction || + journal->j_committing_transaction->t_tid != tid)) { + read_unlock(&journal->j_state_lock); + jbd2_might_wait_for_commit(journal); + read_lock(&journal->j_state_lock); + } +#endif +#ifdef CONFIG_JBD2_DEBUG + if (!tid_geq(journal->j_commit_request, tid)) { + printk(KERN_ERR + "%s: error: j_commit_request=%u, tid=%u\n", + __func__, journal->j_commit_request, tid); + } +#endif + while (tid_gt(tid, journal->j_commit_sequence)) { + jbd2_debug(1, "JBD2: want %u, j_commit_sequence=%u\n", + tid, journal->j_commit_sequence); + read_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_commit); + wait_event(journal->j_wait_done_commit, + !tid_gt(tid, journal->j_commit_sequence)); + read_lock(&journal->j_state_lock); + } + read_unlock(&journal->j_state_lock); + + if (unlikely(is_journal_aborted(journal))) + err = -EIO; + return err; +} + +/* + * Start a fast commit. If there's an ongoing fast or full commit wait for + * it to complete. Returns 0 if a new fast commit was started. Returns -EALREADY + * if a fast commit is not needed, either because there's an already a commit + * going on or this tid has already been committed. Returns -EINVAL if no jbd2 + * commit has yet been performed. + */ +int jbd2_fc_begin_commit(journal_t *journal, tid_t tid) +{ + if (unlikely(is_journal_aborted(journal))) + return -EIO; + /* + * Fast commits only allowed if at least one full commit has + * been processed. + */ + if (!journal->j_stats.ts_tid) + return -EINVAL; + + write_lock(&journal->j_state_lock); + if (tid_geq(journal->j_commit_sequence, tid)) { + write_unlock(&journal->j_state_lock); + return -EALREADY; + } + + if (journal->j_flags & JBD2_FULL_COMMIT_ONGOING || + (journal->j_flags & JBD2_FAST_COMMIT_ONGOING)) { + DEFINE_WAIT(wait); + + prepare_to_wait(&journal->j_fc_wait, &wait, + TASK_UNINTERRUPTIBLE); + write_unlock(&journal->j_state_lock); + schedule(); + finish_wait(&journal->j_fc_wait, &wait); + return -EALREADY; + } + journal->j_flags |= JBD2_FAST_COMMIT_ONGOING; + write_unlock(&journal->j_state_lock); + + return 0; +} +EXPORT_SYMBOL(jbd2_fc_begin_commit); + +/* + * Stop a fast commit. If fallback is set, this function starts commit of + * TID tid before any other fast commit can start. 
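+ *
+ * (Editor's note: the fallback path is the one used by
+ * jbd2_fc_end_commit_fallback() below when a fast commit could not be
+ * completed, in which case a full commit of the given tid is forced
+ * before returning.)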
+ */ +static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) +{ + if (journal->j_fc_cleanup_callback) + journal->j_fc_cleanup_callback(journal, 0, tid); + write_lock(&journal->j_state_lock); + journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; + if (fallback) + journal->j_flags |= JBD2_FULL_COMMIT_ONGOING; + write_unlock(&journal->j_state_lock); + wake_up(&journal->j_fc_wait); + if (fallback) + return jbd2_complete_transaction(journal, tid); + return 0; +} + +int jbd2_fc_end_commit(journal_t *journal) +{ + return __jbd2_fc_end_commit(journal, 0, false); +} +EXPORT_SYMBOL(jbd2_fc_end_commit); + +int jbd2_fc_end_commit_fallback(journal_t *journal) +{ + tid_t tid; + + read_lock(&journal->j_state_lock); + tid = journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0; + read_unlock(&journal->j_state_lock); + return __jbd2_fc_end_commit(journal, tid, true); +} +EXPORT_SYMBOL(jbd2_fc_end_commit_fallback); + +/* Return 1 when transaction with given tid has already committed. */ +int jbd2_transaction_committed(journal_t *journal, tid_t tid) +{ + return tid_geq(READ_ONCE(journal->j_commit_sequence), tid); +} +EXPORT_SYMBOL(jbd2_transaction_committed); + +/* + * When this function returns the transaction corresponding to tid + * will be completed. If the transaction has currently running, start + * committing that transaction before waiting for it to complete. If + * the transaction id is stale, it is by definition already completed, + * so just return SUCCESS. + */ +int jbd2_complete_transaction(journal_t *journal, tid_t tid) +{ + int need_to_wait = 1; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == tid) { + if (journal->j_commit_request != tid) { + /* transaction not yet started, so request it */ + read_unlock(&journal->j_state_lock); + jbd2_log_start_commit(journal, tid); + goto wait_commit; + } + } else if (!(journal->j_committing_transaction && + journal->j_committing_transaction->t_tid == tid)) + need_to_wait = 0; + read_unlock(&journal->j_state_lock); + if (!need_to_wait) + return 0; +wait_commit: + return jbd2_log_wait_commit(journal, tid); +} +EXPORT_SYMBOL(jbd2_complete_transaction); + +/* + * Log buffer allocation routines: + */ + +int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp) +{ + unsigned long blocknr; + + write_lock(&journal->j_state_lock); + J_ASSERT(journal->j_free > 1); + + blocknr = journal->j_head; + journal->j_head++; + journal->j_free--; + if (journal->j_head == journal->j_last) + journal->j_head = journal->j_first; + write_unlock(&journal->j_state_lock); + return jbd2_journal_bmap(journal, blocknr, retp); +} + +/* Map one fast commit buffer for use by the file system */ +int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out) +{ + unsigned long long pblock; + unsigned long blocknr; + int ret = 0; + struct buffer_head *bh; + int fc_off; + + *bh_out = NULL; + + if (journal->j_fc_off + journal->j_fc_first >= journal->j_fc_last) + return -EINVAL; + + fc_off = journal->j_fc_off; + blocknr = journal->j_fc_first + fc_off; + journal->j_fc_off++; + ret = jbd2_journal_bmap(journal, blocknr, &pblock); + if (ret) + return ret; + + bh = __getblk(journal->j_dev, pblock, journal->j_blocksize); + if (!bh) + return -ENOMEM; + + journal->j_fc_wbuf[fc_off] = bh; + + *bh_out = bh; + + return 0; +} +EXPORT_SYMBOL(jbd2_fc_get_buf); + +/* + * Wait on fast commit buffers that were allocated by jbd2_fc_get_buf + * for completion. 
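+ *
+ * (Editor's sketch of the overall fast-commit sequence a client
+ * filesystem drives, based on ext4's usage; error handling omitted:
+ *
+ *	jbd2_fc_begin_commit(journal, tid);
+ *	for each block to log:
+ *		jbd2_fc_get_buf(journal, &bh);
+ *		fill bh->b_data and submit the write
+ *	jbd2_fc_wait_bufs(journal, nblks);
+ *	jbd2_fc_end_commit(journal);
+ * )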
+ */ +int jbd2_fc_wait_bufs(journal_t *journal, int num_blks) +{ + struct buffer_head *bh; + int i, j_fc_off; + + j_fc_off = journal->j_fc_off; + + /* + * Wait in reverse order to minimize chances of us being woken up before + * all IOs have completed + */ + for (i = j_fc_off - 1; i >= j_fc_off - num_blks; i--) { + bh = journal->j_fc_wbuf[i]; + wait_on_buffer(bh); + /* + * Update j_fc_off so jbd2_fc_release_bufs can release remain + * buffer head. + */ + if (unlikely(!buffer_uptodate(bh))) { + journal->j_fc_off = i + 1; + return -EIO; + } + put_bh(bh); + journal->j_fc_wbuf[i] = NULL; + } + + return 0; +} +EXPORT_SYMBOL(jbd2_fc_wait_bufs); + +void jbd2_fc_release_bufs(journal_t *journal) +{ + struct buffer_head *bh; + int i, j_fc_off; + + j_fc_off = journal->j_fc_off; + + for (i = j_fc_off - 1; i >= 0; i--) { + bh = journal->j_fc_wbuf[i]; + if (!bh) + break; + put_bh(bh); + journal->j_fc_wbuf[i] = NULL; + } +} +EXPORT_SYMBOL(jbd2_fc_release_bufs); + +/* + * Conversion of logical to physical block numbers for the journal + * + * On external journals the journal blocks are identity-mapped, so + * this is a no-op. If needed, we can use j_blk_offset - everything is + * ready. + */ +int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, + unsigned long long *retp) +{ + int err = 0; + unsigned long long ret; + sector_t block = blocknr; + + if (journal->j_bmap) { + err = journal->j_bmap(journal, &block); + if (err == 0) + *retp = block; + } else if (journal->j_inode) { + ret = bmap(journal->j_inode, &block); + + if (ret || !block) { + printk(KERN_ALERT "%s: journal block not found " + "at offset %lu on %s\n", + __func__, blocknr, journal->j_devname); + err = -EIO; + jbd2_journal_abort(journal, err); + } else { + *retp = block; + } + + } else { + *retp = blocknr; /* +journal->j_blk_offset */ + } + return err; +} + +/* + * We play buffer_head aliasing tricks to write data/metadata blocks to + * the journal without copying their contents, but for journal + * descriptor blocks we do need to generate bona fide buffers. + * + * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying + * the buffer's contents they really should run flush_dcache_folio(bh->b_folio). + * But we don't bother doing that, so there will be coherency problems with + * mmaps of blockdevs which hold live JBD-controlled filesystems. 
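+ *
+ * (Illustrative note: each descriptor block begins with a
+ * journal_header_t carrying JBD2_MAGIC_NUMBER and a blocktype such as
+ * JBD2_DESCRIPTOR_BLOCK, JBD2_COMMIT_BLOCK or JBD2_REVOKE_BLOCK,
+ * which is exactly what the helper below fills in.)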
+ */ +struct buffer_head * +jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type) +{ + journal_t *journal = transaction->t_journal; + struct buffer_head *bh; + unsigned long long blocknr; + journal_header_t *header; + int err; + + err = jbd2_journal_next_log_block(journal, &blocknr); + + if (err) + return NULL; + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return NULL; + atomic_dec(&transaction->t_outstanding_credits); + lock_buffer(bh); + memset(bh->b_data, 0, journal->j_blocksize); + header = (journal_header_t *)bh->b_data; + header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(type); + header->h_sequence = cpu_to_be32(transaction->t_tid); + set_buffer_uptodate(bh); + unlock_buffer(bh); + BUFFER_TRACE(bh, "return this buffer"); + return bh; +} + +void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh) +{ + struct jbd2_journal_block_tail *tail; + __u32 csum; + + if (!jbd2_journal_has_csum_v2or3(j)) + return; + + tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - + sizeof(struct jbd2_journal_block_tail)); + tail->t_checksum = 0; + csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize); + tail->t_checksum = cpu_to_be32(csum); +} + +/* + * Return tid of the oldest transaction in the journal and block in the journal + * where the transaction starts. + * + * If the journal is now empty, return which will be the next transaction ID + * we will write and where will that transaction start. + * + * The return value is 0 if journal tail cannot be pushed any further, 1 if + * it can. + */ +int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, + unsigned long *block) +{ + transaction_t *transaction; + int ret; + + read_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + transaction = journal->j_checkpoint_transactions; + if (transaction) { + *tid = transaction->t_tid; + *block = transaction->t_log_start; + } else if ((transaction = journal->j_committing_transaction) != NULL) { + *tid = transaction->t_tid; + *block = transaction->t_log_start; + } else if ((transaction = journal->j_running_transaction) != NULL) { + *tid = transaction->t_tid; + *block = journal->j_head; + } else { + *tid = journal->j_transaction_sequence; + *block = journal->j_head; + } + ret = tid_gt(*tid, journal->j_tail_sequence); + spin_unlock(&journal->j_list_lock); + read_unlock(&journal->j_state_lock); + + return ret; +} + +/* + * Update information in journal structure and in on disk journal superblock + * about log tail. This function does not check whether information passed in + * really pushes log tail further. It's responsibility of the caller to make + * sure provided log tail information is valid (e.g. by holding + * j_checkpoint_mutex all the time between computing log tail and calling this + * function as is the case with jbd2_cleanup_journal_tail()). + * + * Requires j_checkpoint_mutex + */ +int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) +{ + unsigned long freed; + int ret; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + + /* + * We cannot afford for write to remain in drive's caches since as + * soon as we update j_tail, next transaction can start reusing journal + * space and if we lose sb update during power failure we'd replay + * old transaction with possibly newly overwritten data. 
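+ *
+ * (Worked example from the editor: with j_first = 1, j_last = 1024,
+ * j_tail = 1000 and a new tail block of 50, the tail has wrapped, so
+ * the code below computes freed = 50 - 1000 + (1024 - 1) = 73 blocks
+ * returned to j_free.)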
+ */ + ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA); + if (ret) + goto out; + + write_lock(&journal->j_state_lock); + freed = block - journal->j_tail; + if (block < journal->j_tail) + freed += journal->j_last - journal->j_first; + + trace_jbd2_update_log_tail(journal, tid, block, freed); + jbd2_debug(1, + "Cleaning journal tail from %u to %u (offset %lu), " + "freeing %lu\n", + journal->j_tail_sequence, tid, block, freed); + + journal->j_free += freed; + journal->j_tail_sequence = tid; + journal->j_tail = block; + write_unlock(&journal->j_state_lock); + +out: + return ret; +} + +/* + * This is a variation of __jbd2_update_log_tail which checks for validity of + * provided log tail and locks j_checkpoint_mutex. So it is safe against races + * with other threads updating log tail. + */ +void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) +{ + mutex_lock_io(&journal->j_checkpoint_mutex); + if (tid_gt(tid, journal->j_tail_sequence)) + __jbd2_update_log_tail(journal, tid, block); + mutex_unlock(&journal->j_checkpoint_mutex); +} + +struct jbd2_stats_proc_session { + journal_t *journal; + struct transaction_stats_s *stats; + int start; + int max; +}; + +static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) +{ + return *pos ? NULL : SEQ_START_TOKEN; +} + +static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos) +{ + (*pos)++; + return NULL; +} + +static int jbd2_seq_info_show(struct seq_file *seq, void *v) +{ + struct jbd2_stats_proc_session *s = seq->private; + + if (v != SEQ_START_TOKEN) + return 0; + seq_printf(seq, "%lu transactions (%lu requested), " + "each up to %u blocks\n", + s->stats->ts_tid, s->stats->ts_requested, + s->journal->j_max_transaction_buffers); + if (s->stats->ts_tid == 0) + return 0; + seq_printf(seq, "average: \n %ums waiting for transaction\n", + jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid)); + seq_printf(seq, " %ums request delay\n", + (s->stats->ts_requested == 0) ? 
0 : + jiffies_to_msecs(s->stats->run.rs_request_delay / + s->stats->ts_requested)); + seq_printf(seq, " %ums running transaction\n", + jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid)); + seq_printf(seq, " %ums transaction was being locked\n", + jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid)); + seq_printf(seq, " %ums flushing data (in ordered mode)\n", + jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid)); + seq_printf(seq, " %ums logging transaction\n", + jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid)); + seq_printf(seq, " %lluus average transaction commit time\n", + div_u64(s->journal->j_average_commit_time, 1000)); + seq_printf(seq, " %lu handles per transaction\n", + s->stats->run.rs_handle_count / s->stats->ts_tid); + seq_printf(seq, " %lu blocks per transaction\n", + s->stats->run.rs_blocks / s->stats->ts_tid); + seq_printf(seq, " %lu logged blocks per transaction\n", + s->stats->run.rs_blocks_logged / s->stats->ts_tid); + return 0; +} + +static void jbd2_seq_info_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations jbd2_seq_info_ops = { + .start = jbd2_seq_info_start, + .next = jbd2_seq_info_next, + .stop = jbd2_seq_info_stop, + .show = jbd2_seq_info_show, +}; + +static int jbd2_seq_info_open(struct inode *inode, struct file *file) +{ + journal_t *journal = pde_data(inode); + struct jbd2_stats_proc_session *s; + int rc, size; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return -ENOMEM; + size = sizeof(struct transaction_stats_s); + s->stats = kmalloc(size, GFP_KERNEL); + if (s->stats == NULL) { + kfree(s); + return -ENOMEM; + } + spin_lock(&journal->j_history_lock); + memcpy(s->stats, &journal->j_stats, size); + s->journal = journal; + spin_unlock(&journal->j_history_lock); + + rc = seq_open(file, &jbd2_seq_info_ops); + if (rc == 0) { + struct seq_file *m = file->private_data; + m->private = s; + } else { + kfree(s->stats); + kfree(s); + } + return rc; + +} + +static int jbd2_seq_info_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct jbd2_stats_proc_session *s = seq->private; + kfree(s->stats); + kfree(s); + return seq_release(inode, file); +} + +static const struct proc_ops jbd2_info_proc_ops = { + .proc_open = jbd2_seq_info_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = jbd2_seq_info_release, +}; + +static struct proc_dir_entry *proc_jbd2_stats; + +static void jbd2_stats_proc_init(journal_t *journal) +{ + journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats); + if (journal->j_proc_entry) { + proc_create_data("info", S_IRUGO, journal->j_proc_entry, + &jbd2_info_proc_ops, journal); + } +} + +static void jbd2_stats_proc_exit(journal_t *journal) +{ + remove_proc_entry("info", journal->j_proc_entry); + remove_proc_entry(journal->j_devname, proc_jbd2_stats); +} + +/* Minimum size of descriptor tag */ +static int jbd2_min_tag_size(void) +{ + /* + * Tag with 32-bit block numbers does not use last four bytes of the + * structure + */ + return sizeof(journal_block_tag_t) - 4; +} + +/** + * jbd2_journal_shrink_scan() + * @shrink: shrinker to work on + * @sc: reclaim request to process + * + * Scan the checkpointed buffer on the checkpoint list and release the + * journal_head. 
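+ *
+ * Return: the number of journal_heads released (nr_shrunk below).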
+ */ +static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + journal_t *journal = shrink->private_data; + unsigned long nr_to_scan = sc->nr_to_scan; + unsigned long nr_shrunk; + unsigned long count; + + count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); + trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count); + + nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan); + + count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); + trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count); + + return nr_shrunk; +} + +/** + * jbd2_journal_shrink_count() + * @shrink: shrinker to work on + * @sc: reclaim request to process + * + * Count the number of checkpoint buffers on the checkpoint list. + */ +static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + journal_t *journal = shrink->private_data; + unsigned long count; + + count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); + trace_jbd2_shrink_count(journal, sc->nr_to_scan, count); + + return count; +} + +/* + * If the journal init or create aborts, we need to mark the journal + * superblock as being NULL to prevent the journal destroy from writing + * back a bogus superblock. + */ +static void journal_fail_superblock(journal_t *journal) +{ + struct buffer_head *bh = journal->j_sb_buffer; + brelse(bh); + journal->j_sb_buffer = NULL; +} + +/* + * Check the superblock for a given journal, performing initial + * validation of the format. + */ +static int journal_check_superblock(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + int num_fc_blks; + int err = -EINVAL; + + if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || + sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { + printk(KERN_WARNING "JBD2: no valid journal superblock found\n"); + return err; + } + + if (be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V1 && + be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V2) { + printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n"); + return err; + } + + if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) { + printk(KERN_WARNING "JBD2: journal file too short\n"); + return err; + } + + if (be32_to_cpu(sb->s_first) == 0 || + be32_to_cpu(sb->s_first) >= journal->j_total_len) { + printk(KERN_WARNING + "JBD2: Invalid start block of journal: %u\n", + be32_to_cpu(sb->s_first)); + return err; + } + + /* + * If this is a V2 superblock, then we have to check the + * features flags on it. + */ + if (!jbd2_format_support_feature(journal)) + return 0; + + if ((sb->s_feature_ro_compat & + ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || + (sb->s_feature_incompat & + ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { + printk(KERN_WARNING "JBD2: Unrecognised features on journal\n"); + return err; + } + + num_fc_blks = jbd2_has_feature_fast_commit(journal) ? + jbd2_journal_get_num_fc_blks(sb) : 0; + if (be32_to_cpu(sb->s_maxlen) < JBD2_MIN_JOURNAL_BLOCKS || + be32_to_cpu(sb->s_maxlen) - JBD2_MIN_JOURNAL_BLOCKS < num_fc_blks) { + printk(KERN_ERR "JBD2: journal file too short %u,%d\n", + be32_to_cpu(sb->s_maxlen), num_fc_blks); + return err; + } + + if (jbd2_has_feature_csum2(journal) && + jbd2_has_feature_csum3(journal)) { + /* Can't have checksum v2 and v3 at the same time! 
*/ + printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 " + "at the same time!\n"); + return err; + } + + if (jbd2_journal_has_csum_v2or3(journal) && + jbd2_has_feature_checksum(journal)) { + /* Can't have checksum v1 and v2 on at the same time! */ + printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " + "at the same time!\n"); + return err; + } + + if (jbd2_journal_has_csum_v2or3(journal)) { + if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) { + printk(KERN_ERR "JBD2: Unknown checksum type\n"); + return err; + } + + /* Check superblock checksum */ + if (sb->s_checksum != jbd2_superblock_csum(sb)) { + printk(KERN_ERR "JBD2: journal checksum error\n"); + err = -EFSBADCRC; + return err; + } + } + + return 0; +} + +static int journal_revoke_records_per_block(journal_t *journal) +{ + int record_size; + int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t); + + if (jbd2_has_feature_64bit(journal)) + record_size = 8; + else + record_size = 4; + + if (jbd2_journal_has_csum_v2or3(journal)) + space -= sizeof(struct jbd2_journal_block_tail); + return space / record_size; +} + +static int jbd2_journal_get_max_txn_bufs(journal_t *journal) +{ + return (journal->j_total_len - journal->j_fc_wbufsize) / 3; +} + +/* + * Base amount of descriptor blocks we reserve for each transaction. + */ +static int jbd2_descriptor_blocks_per_trans(journal_t *journal) +{ + int tag_space = journal->j_blocksize - sizeof(journal_header_t); + int tags_per_block; + + /* Subtract UUID */ + tag_space -= 16; + if (jbd2_journal_has_csum_v2or3(journal)) + tag_space -= sizeof(struct jbd2_journal_block_tail); + /* Commit code leaves a slack space of 16 bytes at the end of block */ + tags_per_block = (tag_space - 16) / journal_tag_bytes(journal); + /* + * Revoke descriptors are accounted separately so we need to reserve + * space for commit block and normal transaction descriptor blocks. + */ + return 1 + DIV_ROUND_UP(jbd2_journal_get_max_txn_bufs(journal), + tags_per_block); +} + +/* + * Initialize number of blocks each transaction reserves for its bookkeeping + * and maximum number of blocks a transaction can use. This needs to be called + * after the journal size and the fastcommit area size are initialized. + */ +static void jbd2_journal_init_transaction_limits(journal_t *journal) +{ + journal->j_revoke_records_per_block = + journal_revoke_records_per_block(journal); + journal->j_transaction_overhead_buffers = + jbd2_descriptor_blocks_per_trans(journal); + journal->j_max_transaction_buffers = + jbd2_journal_get_max_txn_bufs(journal); +} + +/* + * Load the on-disk journal superblock and read the key fields into the + * journal_t. 
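+ * (s_sequence, s_start, s_first, s_errno and s_maxlen are copied into the
+ * matching j_* fields; when fast commit is enabled, the fast-commit area
+ * is carved out of the tail of the log.)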
+ */ +static int journal_load_superblock(journal_t *journal) +{ + int err; + struct buffer_head *bh; + journal_superblock_t *sb; + + bh = getblk_unmovable(journal->j_dev, journal->j_blk_offset, + journal->j_blocksize); + if (bh) + err = bh_read(bh, 0); + if (!bh || err < 0) { + pr_err("%s: Cannot read journal superblock\n", __func__); + brelse(bh); + return -EIO; + } + + journal->j_sb_buffer = bh; + sb = (journal_superblock_t *)bh->b_data; + journal->j_superblock = sb; + err = journal_check_superblock(journal); + if (err) { + journal_fail_superblock(journal); + return err; + } + + journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); + journal->j_tail = be32_to_cpu(sb->s_start); + journal->j_first = be32_to_cpu(sb->s_first); + journal->j_errno = be32_to_cpu(sb->s_errno); + journal->j_last = be32_to_cpu(sb->s_maxlen); + + if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len) + journal->j_total_len = be32_to_cpu(sb->s_maxlen); + /* Precompute checksum seed for all metadata */ + if (jbd2_journal_has_csum_v2or3(journal)) + journal->j_csum_seed = jbd2_chksum(~0, sb->s_uuid, + sizeof(sb->s_uuid)); + /* After journal features are set, we can compute transaction limits */ + jbd2_journal_init_transaction_limits(journal); + + if (jbd2_has_feature_fast_commit(journal)) { + journal->j_fc_last = be32_to_cpu(sb->s_maxlen); + journal->j_last = journal->j_fc_last - + jbd2_journal_get_num_fc_blks(sb); + journal->j_fc_first = journal->j_last + 1; + journal->j_fc_off = 0; + } + + return 0; +} + + +/* + * Management for journal control blocks: functions to create and + * destroy journal_t structures, and to initialise and read existing + * journal blocks from disk. */ + +/* The journal_init_common() function creates and fills a journal_t object + * in memory. It calls journal_load_superblock() to load the on-disk journal + * superblock and initialize the journal_t object. + */ + +static journal_t *journal_init_common(struct block_device *bdev, + struct block_device *fs_dev, + unsigned long long start, int len, int blocksize) +{ + static struct lock_class_key jbd2_trans_commit_key; + journal_t *journal; + int err; + int n; + + journal = kzalloc(sizeof(*journal), GFP_KERNEL); + if (!journal) + return ERR_PTR(-ENOMEM); + + journal->j_blocksize = blocksize; + journal->j_dev = bdev; + journal->j_fs_dev = fs_dev; + journal->j_blk_offset = start; + journal->j_total_len = len; + jbd2_init_fs_dev_write_error(journal); + + err = journal_load_superblock(journal); + if (err) + goto err_cleanup; + + init_waitqueue_head(&journal->j_wait_transaction_locked); + init_waitqueue_head(&journal->j_wait_done_commit); + init_waitqueue_head(&journal->j_wait_commit); + init_waitqueue_head(&journal->j_wait_updates); + init_waitqueue_head(&journal->j_wait_reserved); + init_waitqueue_head(&journal->j_fc_wait); + mutex_init(&journal->j_abort_mutex); + mutex_init(&journal->j_barrier); + mutex_init(&journal->j_checkpoint_mutex); + spin_lock_init(&journal->j_revoke_lock); + spin_lock_init(&journal->j_list_lock); + spin_lock_init(&journal->j_history_lock); + rwlock_init(&journal->j_state_lock); + + journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); + journal->j_min_batch_time = 0; + journal->j_max_batch_time = 15000; /* 15ms */ + atomic_set(&journal->j_reserved_credits, 0); + lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle", + &jbd2_trans_commit_key, 0); + + /* The journal is marked for error until we succeed with recovery! 
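+	 * (JBD2_ABORT, set just below, is cleared again in
+	 * jbd2_journal_load() once recovery has completed.)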
*/ + journal->j_flags = JBD2_ABORT; + + /* Set up a default-sized revoke table for the new mount. */ + err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); + if (err) + goto err_cleanup; + + /* + * journal descriptor can store up to n blocks, we need enough + * buffers to write out full descriptor block. + */ + err = -ENOMEM; + n = journal->j_blocksize / jbd2_min_tag_size(); + journal->j_wbufsize = n; + journal->j_fc_wbuf = NULL; + journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *), + GFP_KERNEL); + if (!journal->j_wbuf) + goto err_cleanup; + + err = percpu_counter_init(&journal->j_checkpoint_jh_count, 0, + GFP_KERNEL); + if (err) + goto err_cleanup; + + journal->j_shrink_transaction = NULL; + + journal->j_shrinker = shrinker_alloc(0, "jbd2-journal:(%u:%u)", + MAJOR(bdev->bd_dev), + MINOR(bdev->bd_dev)); + if (!journal->j_shrinker) { + err = -ENOMEM; + goto err_cleanup; + } + + journal->j_shrinker->scan_objects = jbd2_journal_shrink_scan; + journal->j_shrinker->count_objects = jbd2_journal_shrink_count; + journal->j_shrinker->private_data = journal; + + shrinker_register(journal->j_shrinker); + + return journal; + +err_cleanup: + percpu_counter_destroy(&journal->j_checkpoint_jh_count); + kfree(journal->j_wbuf); + jbd2_journal_destroy_revoke(journal); + journal_fail_superblock(journal); + kfree(journal); + return ERR_PTR(err); +} + +/* jbd2_journal_init_dev and jbd2_journal_init_inode: + * + * Create a journal structure assigned some fixed set of disk blocks to + * the journal. We don't actually touch those disk blocks yet, but we + * need to set up all of the mapping information to tell the journaling + * system where the journal blocks are. + * + */ + +/** + * journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure + * @bdev: Block device on which to create the journal + * @fs_dev: Device which hold journalled filesystem for this journal. + * @start: Block nr Start of journal. + * @len: Length of the journal in blocks. + * @blocksize: blocksize of journalling device + * + * Returns: a newly created journal_t * + * + * jbd2_journal_init_dev creates a journal which maps a fixed contiguous + * range of blocks on an arbitrary block device. + * + */ +journal_t *jbd2_journal_init_dev(struct block_device *bdev, + struct block_device *fs_dev, + unsigned long long start, int len, int blocksize) +{ + journal_t *journal; + + journal = journal_init_common(bdev, fs_dev, start, len, blocksize); + if (IS_ERR(journal)) + return ERR_CAST(journal); + + snprintf(journal->j_devname, sizeof(journal->j_devname), + "%pg", journal->j_dev); + strreplace(journal->j_devname, '/', '!'); + jbd2_stats_proc_init(journal); + + return journal; +} + +/** + * journal_t * jbd2_journal_init_inode () - creates a journal which maps to a inode. + * @inode: An inode to create the journal in + * + * jbd2_journal_init_inode creates a journal which maps an on-disk inode as + * the journal. The inode must exist already, must support bmap() and + * must have all data blocks preallocated. + */ +journal_t *jbd2_journal_init_inode(struct inode *inode) +{ + journal_t *journal; + sector_t blocknr; + int err = 0; + + blocknr = 0; + err = bmap(inode, &blocknr); + if (err || !blocknr) { + pr_err("%s: Cannot locate journal superblock\n", __func__); + return err ? 
ERR_PTR(err) : ERR_PTR(-EINVAL); + } + + jbd2_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n", + inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size, + inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); + + journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev, + blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits, + inode->i_sb->s_blocksize); + if (IS_ERR(journal)) + return ERR_CAST(journal); + + journal->j_inode = inode; + snprintf(journal->j_devname, sizeof(journal->j_devname), + "%pg-%lu", journal->j_dev, journal->j_inode->i_ino); + strreplace(journal->j_devname, '/', '!'); + jbd2_stats_proc_init(journal); + + return journal; +} + +/* + * Given a journal_t structure, initialise the various fields for + * startup of a new journaling session. We use this both when creating + * a journal, and after recovering an old journal to reset it for + * subsequent use. + */ + +static int journal_reset(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + unsigned long long first, last; + + first = be32_to_cpu(sb->s_first); + last = be32_to_cpu(sb->s_maxlen); + if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) { + printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n", + first, last); + journal_fail_superblock(journal); + return -EINVAL; + } + + journal->j_first = first; + journal->j_last = last; + + if (journal->j_head != 0 && journal->j_flags & JBD2_CYCLE_RECORD) { + /* + * Disable the cycled recording mode if the journal head block + * number is not correct. + */ + if (journal->j_head < first || journal->j_head >= last) { + printk(KERN_WARNING "JBD2: Incorrect Journal head block %lu, " + "disable journal_cycle_record\n", + journal->j_head); + journal->j_head = journal->j_first; + } + } else { + journal->j_head = journal->j_first; + } + journal->j_tail = journal->j_head; + journal->j_free = journal->j_last - journal->j_first; + + journal->j_tail_sequence = journal->j_transaction_sequence; + journal->j_commit_sequence = journal->j_transaction_sequence - 1; + journal->j_commit_request = journal->j_commit_sequence; + + /* + * Now that journal recovery is done, turn fast commits off here. This + * way, if fast commit was enabled before the crash but if now FS has + * disabled it, we don't enable fast commits. + */ + jbd2_clear_feature_fast_commit(journal); + + /* + * As a special case, if the on-disk copy is already marked as needing + * no recovery (s_start == 0), then we can safely defer the superblock + * update until the next commit by setting JBD2_FLUSHED. This avoids + * attempting a write to a potential-readonly device. + */ + if (sb->s_start == 0) { + jbd2_debug(1, "JBD2: Skipping superblock update on recovered sb " + "(start %ld, seq %u, errno %d)\n", + journal->j_tail, journal->j_tail_sequence, + journal->j_errno); + journal->j_flags |= JBD2_FLUSHED; + } else { + /* Lock here to make assertions happy... */ + mutex_lock_io(&journal->j_checkpoint_mutex); + /* + * Update log tail information. We use REQ_FUA since new + * transaction will start reusing journal space and so we + * must make sure information about current log tail is on + * disk before that. 
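+		 * (Same reasoning as the comment in __jbd2_update_log_tail().)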
+ */ + jbd2_journal_update_sb_log_tail(journal, + journal->j_tail_sequence, + journal->j_tail, REQ_FUA); + mutex_unlock(&journal->j_checkpoint_mutex); + } + return jbd2_journal_start_thread(journal); +} + +/* + * This function expects that the caller will have locked the journal + * buffer head, and will return with it unlocked + */ +static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags) +{ + struct buffer_head *bh = journal->j_sb_buffer; + journal_superblock_t *sb = journal->j_superblock; + int ret = 0; + + /* Buffer got discarded which means block device got invalidated */ + if (!buffer_mapped(bh)) { + unlock_buffer(bh); + return -EIO; + } + + /* + * Always set high priority flags to exempt from block layer's + * QOS policies, e.g. writeback throttle. + */ + write_flags |= JBD2_JOURNAL_REQ_FLAGS; + if (!(journal->j_flags & JBD2_BARRIER)) + write_flags &= ~(REQ_FUA | REQ_PREFLUSH); + + trace_jbd2_write_superblock(journal, write_flags); + + if (buffer_write_io_error(bh)) { + /* + * Oh, dear. A previous attempt to write the journal + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + printk(KERN_ERR "JBD2: previous I/O error detected " + "for journal superblock update for %s.\n", + journal->j_devname); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + if (jbd2_journal_has_csum_v2or3(journal)) + sb->s_checksum = jbd2_superblock_csum(sb); + get_bh(bh); + bh->b_end_io = end_buffer_write_sync; + submit_bh(REQ_OP_WRITE | write_flags, bh); + wait_on_buffer(bh); + if (buffer_write_io_error(bh)) { + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + ret = -EIO; + } + if (ret) { + printk(KERN_ERR "JBD2: I/O error when updating journal superblock for %s.\n", + journal->j_devname); + if (!is_journal_aborted(journal)) + jbd2_journal_abort(journal, ret); + } + + return ret; +} + +/** + * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk. + * @journal: The journal to update. + * @tail_tid: TID of the new transaction at the tail of the log + * @tail_block: The first block of the transaction at the tail of the log + * @write_flags: Flags for the journal sb write operation + * + * Update a journal's superblock information about log tail and write it to + * disk, waiting for the IO to complete. + */ +int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, + unsigned long tail_block, + blk_opf_t write_flags) +{ + journal_superblock_t *sb = journal->j_superblock; + int ret; + + if (is_journal_aborted(journal)) + return -EIO; + if (jbd2_check_fs_dev_write_error(journal)) { + jbd2_journal_abort(journal, -EIO); + return -EIO; + } + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + jbd2_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", + tail_block, tail_tid); + + lock_buffer(journal->j_sb_buffer); + sb->s_sequence = cpu_to_be32(tail_tid); + sb->s_start = cpu_to_be32(tail_block); + + ret = jbd2_write_superblock(journal, write_flags); + if (ret) + goto out; + + /* Log is no longer empty */ + write_lock(&journal->j_state_lock); + journal->j_flags &= ~JBD2_FLUSHED; + write_unlock(&journal->j_state_lock); + +out: + return ret; +} + +/** + * jbd2_mark_journal_empty() - Mark on disk journal as empty. + * @journal: The journal to update. 
+ * @write_flags: Flags for the journal sb write operation + * + * Update a journal's dynamic superblock fields to show that journal is empty. + * Write updated superblock to disk waiting for IO to complete. + */ +static void jbd2_mark_journal_empty(journal_t *journal, blk_opf_t write_flags) +{ + journal_superblock_t *sb = journal->j_superblock; + bool had_fast_commit = false; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + lock_buffer(journal->j_sb_buffer); + if (sb->s_start == 0) { /* Is it already empty? */ + unlock_buffer(journal->j_sb_buffer); + return; + } + + jbd2_debug(1, "JBD2: Marking journal as empty (seq %u)\n", + journal->j_tail_sequence); + + sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); + sb->s_start = cpu_to_be32(0); + sb->s_head = cpu_to_be32(journal->j_head); + if (jbd2_has_feature_fast_commit(journal)) { + /* + * When journal is clean, no need to commit fast commit flag and + * make file system incompatible with older kernels. + */ + jbd2_clear_feature_fast_commit(journal); + had_fast_commit = true; + } + + jbd2_write_superblock(journal, write_flags); + + if (had_fast_commit) + jbd2_set_feature_fast_commit(journal); + + /* Log is empty */ + write_lock(&journal->j_state_lock); + journal->j_flags |= JBD2_FLUSHED; + write_unlock(&journal->j_state_lock); +} + +/** + * __jbd2_journal_erase() - Discard or zeroout journal blocks (excluding superblock) + * @journal: The journal to erase. + * @flags: A discard/zeroout request is sent for each physically contigous + * region of the journal. Either JBD2_JOURNAL_FLUSH_DISCARD or + * JBD2_JOURNAL_FLUSH_ZEROOUT must be set to determine which operation + * to perform. + * + * Note: JBD2_JOURNAL_FLUSH_ZEROOUT attempts to use hardware offload. Zeroes + * will be explicitly written if no hardware offload is available, see + * blkdev_issue_zeroout for more details. 
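+ *
+ * Return: 0 on success; -EINVAL for an invalid @flags combination,
+ * -EOPNOTSUPP if discard is requested but unsupported by the device, or
+ * another negative errno from the block layer.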
+ */ +static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) +{ + int err = 0; + unsigned long block, log_offset; /* logical */ + unsigned long long phys_block, block_start, block_stop; /* physical */ + loff_t byte_start, byte_stop, byte_count; + + /* flags must be set to either discard or zeroout */ + if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) || !flags || + ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && + (flags & JBD2_JOURNAL_FLUSH_ZEROOUT))) + return -EINVAL; + + if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && + !bdev_max_discard_sectors(journal->j_dev)) + return -EOPNOTSUPP; + + /* + * lookup block mapping and issue discard/zeroout for each + * contiguous region + */ + log_offset = be32_to_cpu(journal->j_superblock->s_first); + block_start = ~0ULL; + for (block = log_offset; block < journal->j_total_len; block++) { + err = jbd2_journal_bmap(journal, block, &phys_block); + if (err) { + pr_err("JBD2: bad block at offset %lu", block); + return err; + } + + if (block_start == ~0ULL) + block_stop = block_start = phys_block; + + /* + * last block not contiguous with current block, + * process last contiguous region and return to this block on + * next loop + */ + if (phys_block != block_stop) { + block--; + } else { + block_stop++; + /* + * if this isn't the last block of journal, + * no need to process now because next block may also + * be part of this contiguous region + */ + if (block != journal->j_total_len - 1) + continue; + } + + /* + * end of contiguous region or this is last block of journal, + * take care of the region + */ + byte_start = block_start * journal->j_blocksize; + byte_stop = block_stop * journal->j_blocksize; + byte_count = (block_stop - block_start) * journal->j_blocksize; + + truncate_inode_pages_range(journal->j_dev->bd_mapping, + byte_start, byte_stop - 1); + + if (flags & JBD2_JOURNAL_FLUSH_DISCARD) { + err = blkdev_issue_discard(journal->j_dev, + byte_start >> SECTOR_SHIFT, + byte_count >> SECTOR_SHIFT, + GFP_NOFS); + } else if (flags & JBD2_JOURNAL_FLUSH_ZEROOUT) { + err = blkdev_issue_zeroout(journal->j_dev, + byte_start >> SECTOR_SHIFT, + byte_count >> SECTOR_SHIFT, + GFP_NOFS, 0); + } + + if (unlikely(err != 0)) { + pr_err("JBD2: (error %d) unable to wipe journal at physical blocks [%llu, %llu)", + err, block_start, block_stop); + return err; + } + + /* reset start and stop after processing a region */ + block_start = ~0ULL; + } + + return blkdev_issue_flush(journal->j_dev); +} + +/** + * jbd2_journal_update_sb_errno() - Update error in the journal. + * @journal: The journal to update. + * + * Update a journal's errno. Write updated superblock to disk waiting for IO + * to complete. + */ +void jbd2_journal_update_sb_errno(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + int errcode; + + lock_buffer(journal->j_sb_buffer); + errcode = journal->j_errno; + if (errcode == -ESHUTDOWN) + errcode = 0; + jbd2_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode); + sb->s_errno = cpu_to_be32(errcode); + + jbd2_write_superblock(journal, REQ_FUA); +} +EXPORT_SYMBOL(jbd2_journal_update_sb_errno); + +/** + * jbd2_journal_load() - Read journal from disk. + * @journal: Journal to act on. + * + * Given a journal_t structure which tells us which disk blocks contain + * a journal, read the journal from disk to initialise the in-memory + * structures. 
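+ *
+ * Return: 0 on success, -EFSCORRUPTED if a failed commit was found, or
+ * another negative errno if recovery or the journal reset fails.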
+ */ +int jbd2_journal_load(journal_t *journal) +{ + int err; + journal_superblock_t *sb = journal->j_superblock; + + /* + * Create a slab for this blocksize + */ + err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize)); + if (err) + return err; + + /* Let the recovery code check whether it needs to recover any + * data from the journal. */ + err = jbd2_journal_recover(journal); + if (err) { + pr_warn("JBD2: journal recovery failed\n"); + return err; + } + + if (journal->j_failed_commit) { + printk(KERN_ERR "JBD2: journal transaction %u on %s " + "is corrupt.\n", journal->j_failed_commit, + journal->j_devname); + return -EFSCORRUPTED; + } + /* + * clear JBD2_ABORT flag initialized in journal_init_common + * here to update log tail information with the newest seq. + */ + journal->j_flags &= ~JBD2_ABORT; + + /* OK, we've finished with the dynamic journal bits: + * reinitialise the dynamic contents of the superblock in memory + * and reset them on disk. */ + err = journal_reset(journal); + if (err) { + pr_warn("JBD2: journal reset failed\n"); + return err; + } + + journal->j_flags |= JBD2_LOADED; + return 0; +} + +/** + * jbd2_journal_destroy() - Release a journal_t structure. + * @journal: Journal to act on. + * + * Release a journal_t structure once it is no longer in use by the + * journaled object. + * Return <0 if we couldn't clean up the journal. + */ +int jbd2_journal_destroy(journal_t *journal) +{ + int err = 0; + + /* Wait for the commit thread to wake up and die. */ + journal_kill_thread(journal); + + /* Force a final log commit */ + if (journal->j_running_transaction) + jbd2_journal_commit_transaction(journal); + + /* Force any old transactions to disk */ + + /* Totally anal locking here... */ + spin_lock(&journal->j_list_lock); + while (journal->j_checkpoint_transactions != NULL) { + spin_unlock(&journal->j_list_lock); + mutex_lock_io(&journal->j_checkpoint_mutex); + err = jbd2_log_do_checkpoint(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + /* + * If checkpointing failed, just free the buffers to avoid + * looping forever + */ + if (err) { + jbd2_journal_destroy_checkpoint(journal); + spin_lock(&journal->j_list_lock); + break; + } + spin_lock(&journal->j_list_lock); + } + + J_ASSERT(journal->j_running_transaction == NULL); + J_ASSERT(journal->j_committing_transaction == NULL); + J_ASSERT(journal->j_checkpoint_transactions == NULL); + spin_unlock(&journal->j_list_lock); + + /* + * OK, all checkpoint transactions have been checked, now check the + * writeback errseq of fs dev and abort the journal if some buffer + * failed to write back to the original location, otherwise the + * filesystem may become inconsistent. 
+ */ + if (!is_journal_aborted(journal) && + jbd2_check_fs_dev_write_error(journal)) + jbd2_journal_abort(journal, -EIO); + + if (journal->j_sb_buffer) { + if (!is_journal_aborted(journal)) { + mutex_lock_io(&journal->j_checkpoint_mutex); + + write_lock(&journal->j_state_lock); + journal->j_tail_sequence = + ++journal->j_transaction_sequence; + write_unlock(&journal->j_state_lock); + + jbd2_mark_journal_empty(journal, REQ_PREFLUSH | REQ_FUA); + mutex_unlock(&journal->j_checkpoint_mutex); + } else + err = -EIO; + brelse(journal->j_sb_buffer); + } + + if (journal->j_shrinker) { + percpu_counter_destroy(&journal->j_checkpoint_jh_count); + shrinker_free(journal->j_shrinker); + } + if (journal->j_proc_entry) + jbd2_stats_proc_exit(journal); + iput(journal->j_inode); + if (journal->j_revoke) + jbd2_journal_destroy_revoke(journal); + kfree(journal->j_fc_wbuf); + kfree(journal->j_wbuf); + kfree(journal); + + return err; +} + + +/** + * jbd2_journal_check_used_features() - Check if features specified are used. + * @journal: Journal to check. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Check whether the journal uses all of a given set of + * features. Return true (non-zero) if it does. + **/ + +int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + journal_superblock_t *sb; + + if (!compat && !ro && !incompat) + return 1; + if (!jbd2_format_support_feature(journal)) + return 0; + + sb = journal->j_superblock; + + if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && + ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && + ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) + return 1; + + return 0; +} + +/** + * jbd2_journal_check_available_features() - Check feature set in journalling layer + * @journal: Journal to check. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Check whether the journaling code supports the use of + * all of a given set of features on this journal. Return true + * (non-zero) if it can. */ + +int jbd2_journal_check_available_features(journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + if (!compat && !ro && !incompat) + return 1; + + if (!jbd2_format_support_feature(journal)) + return 0; + + if ((compat & JBD2_KNOWN_COMPAT_FEATURES) == compat && + (ro & JBD2_KNOWN_ROCOMPAT_FEATURES) == ro && + (incompat & JBD2_KNOWN_INCOMPAT_FEATURES) == incompat) + return 1; + + return 0; +} + +static int +jbd2_journal_initialize_fast_commit(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + unsigned long long num_fc_blks; + + num_fc_blks = jbd2_journal_get_num_fc_blks(sb); + if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS) + return -ENOSPC; + + /* Are we called twice? 
*/ + WARN_ON(journal->j_fc_wbuf != NULL); + journal->j_fc_wbuf = kmalloc_array(num_fc_blks, + sizeof(struct buffer_head *), GFP_KERNEL); + if (!journal->j_fc_wbuf) + return -ENOMEM; + + journal->j_fc_wbufsize = num_fc_blks; + journal->j_fc_last = journal->j_last; + journal->j_last = journal->j_fc_last - num_fc_blks; + journal->j_fc_first = journal->j_last + 1; + journal->j_fc_off = 0; + journal->j_free = journal->j_last - journal->j_first; + + return 0; +} + +/** + * jbd2_journal_set_features() - Mark a given journal feature in the superblock + * @journal: Journal to act on. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Mark a given journal feature as present on the + * superblock. Returns true if the requested features could be set. + * + */ + +int jbd2_journal_set_features(journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ +#define INCOMPAT_FEATURE_ON(f) \ + ((incompat & (f)) && !(sb->s_feature_incompat & cpu_to_be32(f))) +#define COMPAT_FEATURE_ON(f) \ + ((compat & (f)) && !(sb->s_feature_compat & cpu_to_be32(f))) + journal_superblock_t *sb; + + if (jbd2_journal_check_used_features(journal, compat, ro, incompat)) + return 1; + + if (!jbd2_journal_check_available_features(journal, compat, ro, incompat)) + return 0; + + /* If enabling v2 checksums, turn on v3 instead */ + if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) { + incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2; + incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3; + } + + /* Asking for checksumming v3 and v1? Only give them v3. */ + if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 && + compat & JBD2_FEATURE_COMPAT_CHECKSUM) + compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM; + + jbd2_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", + compat, ro, incompat); + + sb = journal->j_superblock; + + if (incompat & JBD2_FEATURE_INCOMPAT_FAST_COMMIT) { + if (jbd2_journal_initialize_fast_commit(journal)) { + pr_err("JBD2: Cannot enable fast commits.\n"); + return 0; + } + } + + lock_buffer(journal->j_sb_buffer); + + /* If enabling v3 checksums, update superblock and precompute seed */ + if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { + sb->s_checksum_type = JBD2_CRC32C_CHKSUM; + sb->s_feature_compat &= + ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); + journal->j_csum_seed = jbd2_chksum(~0, sb->s_uuid, + sizeof(sb->s_uuid)); + } + + /* If enabling v1 checksums, downgrade superblock */ + if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM)) + sb->s_feature_incompat &= + ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 | + JBD2_FEATURE_INCOMPAT_CSUM_V3); + + sb->s_feature_compat |= cpu_to_be32(compat); + sb->s_feature_ro_compat |= cpu_to_be32(ro); + sb->s_feature_incompat |= cpu_to_be32(incompat); + unlock_buffer(journal->j_sb_buffer); + jbd2_journal_init_transaction_limits(journal); + + return 1; +#undef COMPAT_FEATURE_ON +#undef INCOMPAT_FEATURE_ON +} + +/* + * jbd2_journal_clear_features() - Clear a given journal feature in the + * superblock + * @journal: Journal to act on. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Clear a given journal feature as present on the + * superblock. 
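+ * (Only the in-memory copy of the superblock is updated here; the change
+ * reaches disk the next time the superblock is written out.)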
+ */ +void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + journal_superblock_t *sb; + + jbd2_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n", + compat, ro, incompat); + + sb = journal->j_superblock; + + sb->s_feature_compat &= ~cpu_to_be32(compat); + sb->s_feature_ro_compat &= ~cpu_to_be32(ro); + sb->s_feature_incompat &= ~cpu_to_be32(incompat); + jbd2_journal_init_transaction_limits(journal); +} +EXPORT_SYMBOL(jbd2_journal_clear_features); + +/** + * jbd2_journal_flush() - Flush journal + * @journal: Journal to act on. + * @flags: optional operation on the journal blocks after the flush (see below) + * + * Flush all data for a given journal to disk and empty the journal. + * Filesystems can use this when remounting readonly to ensure that + * recovery does not need to happen on remount. Optionally, a discard or zeroout + * can be issued on the journal blocks after flushing. + * + * flags: + * JBD2_JOURNAL_FLUSH_DISCARD: issues discards for the journal blocks + * JBD2_JOURNAL_FLUSH_ZEROOUT: issues zeroouts for the journal blocks + */ +int jbd2_journal_flush(journal_t *journal, unsigned int flags) +{ + int err = 0; + transaction_t *transaction = NULL; + + write_lock(&journal->j_state_lock); + + /* Force everything buffered to the log... */ + if (journal->j_running_transaction) { + transaction = journal->j_running_transaction; + __jbd2_log_start_commit(journal, transaction->t_tid); + } else if (journal->j_committing_transaction) + transaction = journal->j_committing_transaction; + + /* Wait for the log commit to complete... */ + if (transaction) { + tid_t tid = transaction->t_tid; + + write_unlock(&journal->j_state_lock); + jbd2_log_wait_commit(journal, tid); + } else { + write_unlock(&journal->j_state_lock); + } + + /* ...and flush everything in the log out to disk. */ + spin_lock(&journal->j_list_lock); + while (!err && journal->j_checkpoint_transactions != NULL) { + spin_unlock(&journal->j_list_lock); + mutex_lock_io(&journal->j_checkpoint_mutex); + err = jbd2_log_do_checkpoint(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + spin_lock(&journal->j_list_lock); + } + spin_unlock(&journal->j_list_lock); + + if (is_journal_aborted(journal)) + return -EIO; + + mutex_lock_io(&journal->j_checkpoint_mutex); + if (!err) { + err = jbd2_cleanup_journal_tail(journal); + if (err < 0) { + mutex_unlock(&journal->j_checkpoint_mutex); + goto out; + } + err = 0; + } + + /* Finally, mark the journal as really needing no recovery. + * This sets s_start==0 in the underlying superblock, which is + * the magic code for a fully-recovered superblock. Any future + * commits of data to the journal will restore the current + * s_start value. */ + jbd2_mark_journal_empty(journal, REQ_FUA); + + if (flags) + err = __jbd2_journal_erase(journal, flags); + + mutex_unlock(&journal->j_checkpoint_mutex); + write_lock(&journal->j_state_lock); + J_ASSERT(!journal->j_running_transaction); + J_ASSERT(!journal->j_committing_transaction); + J_ASSERT(!journal->j_checkpoint_transactions); + J_ASSERT(journal->j_head == journal->j_tail); + J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); + write_unlock(&journal->j_state_lock); +out: + return err; +} + +/** + * jbd2_journal_wipe() - Wipe journal contents + * @journal: Journal to act on. + * @write: flag (see below) + * + * Wipe out all of the contents of a journal, safely. This will produce + * a warning if the journal contains any valid recovery information. 
+ * Must be called between journal_init_*() and jbd2_journal_load(). + * + * If 'write' is non-zero, then we wipe out the journal on disk; otherwise + * we merely suppress recovery. + */ + +int jbd2_journal_wipe(journal_t *journal, int write) +{ + int err; + + J_ASSERT (!(journal->j_flags & JBD2_LOADED)); + + if (!journal->j_tail) + return 0; + + printk(KERN_WARNING "JBD2: %s recovery information on journal\n", + write ? "Clearing" : "Ignoring"); + + err = jbd2_journal_skip_recovery(journal); + if (write) { + /* Lock to make assertions happy... */ + mutex_lock_io(&journal->j_checkpoint_mutex); + jbd2_mark_journal_empty(journal, REQ_FUA); + mutex_unlock(&journal->j_checkpoint_mutex); + } + + return err; +} + +/** + * jbd2_journal_abort () - Shutdown the journal immediately. + * @journal: the journal to shutdown. + * @errno: an error number to record in the journal indicating + * the reason for the shutdown. + * + * Perform a complete, immediate shutdown of the ENTIRE + * journal (not of a single transaction). This operation cannot be + * undone without closing and reopening the journal. + * + * The jbd2_journal_abort function is intended to support higher level error + * recovery mechanisms such as the ext2/ext3 remount-readonly error + * mode. + * + * Journal abort has very specific semantics. Any existing dirty, + * unjournaled buffers in the main filesystem will still be written to + * disk by bdflush, but the journaling mechanism will be suspended + * immediately and no further transaction commits will be honoured. + * + * Any dirty, journaled buffers will be written back to disk without + * hitting the journal. Atomicity cannot be guaranteed on an aborted + * filesystem, but we _do_ attempt to leave as much data as possible + * behind for fsck to use for cleanup. + * + * Any attempt to get a new transaction handle on a journal which is in + * ABORT state will just result in an -EROFS error return. A + * jbd2_journal_stop on an existing handle will return -EIO if we have + * entered abort state during the update. + * + * Recursive transactions are not disturbed by journal abort until the + * final jbd2_journal_stop, which will receive the -EIO error. + * + * Finally, the jbd2_journal_abort call allows the caller to supply an errno + * which will be recorded (if possible) in the journal superblock. This + * allows a client to record failure conditions in the middle of a + * transaction without having to complete the transaction to record the + * failure to disk. ext3_error, for example, now uses this + * functionality. + * + */ + +void jbd2_journal_abort(journal_t *journal, int errno) +{ + transaction_t *transaction; + + /* + * Lock the aborting procedure until everything is done, this avoid + * races between filesystem's error handling flow (e.g. ext4_abort()), + * ensure panic after the error info is written into journal's + * superblock. + */ + mutex_lock(&journal->j_abort_mutex); + /* + * ESHUTDOWN always takes precedence because a file system check + * caused by any other journal abort error is not required after + * a shutdown triggered. 
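+ * (In other words, once -ESHUTDOWN has been recorded it is never replaced
+ * by a less severe errno, while -ESHUTDOWN may replace an earlier one.)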
+ */ + write_lock(&journal->j_state_lock); + if (journal->j_flags & JBD2_ABORT) { + int old_errno = journal->j_errno; + + write_unlock(&journal->j_state_lock); + if (old_errno != -ESHUTDOWN && errno == -ESHUTDOWN) { + journal->j_errno = errno; + jbd2_journal_update_sb_errno(journal); + } + mutex_unlock(&journal->j_abort_mutex); + return; + } + + /* + * Mark the abort as occurred and start current running transaction + * to release all journaled buffer. + */ + pr_err("Aborting journal on device %s.\n", journal->j_devname); + + journal->j_flags |= JBD2_ABORT; + journal->j_errno = errno; + transaction = journal->j_running_transaction; + if (transaction) + __jbd2_log_start_commit(journal, transaction->t_tid); + write_unlock(&journal->j_state_lock); + + /* + * Record errno to the journal super block, so that fsck and jbd2 + * layer could realise that a filesystem check is needed. + */ + jbd2_journal_update_sb_errno(journal); + mutex_unlock(&journal->j_abort_mutex); +} + +/** + * jbd2_journal_errno() - returns the journal's error state. + * @journal: journal to examine. + * + * This is the errno number set with jbd2_journal_abort(), the last + * time the journal was mounted - if the journal was stopped + * without calling abort this will be 0. + * + * If the journal has been aborted on this mount time -EROFS will + * be returned. + */ +int jbd2_journal_errno(journal_t *journal) +{ + int err; + + read_lock(&journal->j_state_lock); + if (journal->j_flags & JBD2_ABORT) + err = -EROFS; + else + err = journal->j_errno; + read_unlock(&journal->j_state_lock); + return err; +} + +/** + * jbd2_journal_clear_err() - clears the journal's error state + * @journal: journal to act on. + * + * An error must be cleared or acked to take a FS out of readonly + * mode. + */ +int jbd2_journal_clear_err(journal_t *journal) +{ + int err = 0; + + write_lock(&journal->j_state_lock); + if (journal->j_flags & JBD2_ABORT) + err = -EROFS; + else + journal->j_errno = 0; + write_unlock(&journal->j_state_lock); + return err; +} + +/** + * jbd2_journal_ack_err() - Ack journal err. + * @journal: journal to act on. + * + * An error must be cleared or acked to take a FS out of readonly + * mode. + */ +void jbd2_journal_ack_err(journal_t *journal) +{ + write_lock(&journal->j_state_lock); + if (journal->j_errno) + journal->j_flags |= JBD2_ACK_ERR; + write_unlock(&journal->j_state_lock); +} + +int jbd2_journal_blocks_per_folio(struct inode *inode) +{ + return 1 << (PAGE_SHIFT + mapping_max_folio_order(inode->i_mapping) - + inode->i_sb->s_blocksize_bits); +} + +/* + * helper functions to deal with 32 or 64bit block numbers. + */ +size_t journal_tag_bytes(journal_t *journal) +{ + size_t sz; + + if (jbd2_has_feature_csum3(journal)) + return sizeof(journal_block_tag3_t); + + sz = sizeof(journal_block_tag_t); + + if (jbd2_has_feature_csum2(journal)) + sz += sizeof(__u16); + + if (jbd2_has_feature_64bit(journal)) + return sz; + else + return sz - sizeof(__u32); +} + +/* + * JBD memory management + * + * These functions are used to allocate block-sized chunks of memory + * used for making copies of buffer_head data. Very often it will be + * page-sized chunks of data, but sometimes it will be in + * sub-page-size chunks. (For example, 16k pages on Power systems + * with a 4k block file system.) For blocks smaller than a page, we + * use a SLAB allocator. There are slab caches for each block size, + * which are allocated at mount time, if necessary, and we only free + * (all of) the slab caches when/if the jbd2 module is unloaded. 
For + * this reason we don't need to a mutex to protect access to + * jbd2_slab[] allocating or releasing memory; only in + * jbd2_journal_create_slab(). + */ +#define JBD2_MAX_SLABS 8 +static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS]; + +static const char *jbd2_slab_names[JBD2_MAX_SLABS] = { + "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k", + "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k" +}; + + +static void jbd2_journal_destroy_slabs(void) +{ + int i; + + for (i = 0; i < JBD2_MAX_SLABS; i++) { + kmem_cache_destroy(jbd2_slab[i]); + jbd2_slab[i] = NULL; + } +} + +static int jbd2_journal_create_slab(size_t size) +{ + static DEFINE_MUTEX(jbd2_slab_create_mutex); + int i = order_base_2(size) - 10; + size_t slab_size; + + if (size == PAGE_SIZE) + return 0; + + if (i >= JBD2_MAX_SLABS) + return -EINVAL; + + if (unlikely(i < 0)) + i = 0; + mutex_lock(&jbd2_slab_create_mutex); + if (jbd2_slab[i]) { + mutex_unlock(&jbd2_slab_create_mutex); + return 0; /* Already created */ + } + + slab_size = 1 << (i+10); + jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size, + slab_size, 0, NULL); + mutex_unlock(&jbd2_slab_create_mutex); + if (!jbd2_slab[i]) { + printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n"); + return -ENOMEM; + } + return 0; +} + +static struct kmem_cache *get_slab(size_t size) +{ + int i = order_base_2(size) - 10; + + BUG_ON(i >= JBD2_MAX_SLABS); + if (unlikely(i < 0)) + i = 0; + BUG_ON(jbd2_slab[i] == NULL); + return jbd2_slab[i]; +} + +void *jbd2_alloc(size_t size, gfp_t flags) +{ + void *ptr; + + BUG_ON(size & (size-1)); /* Must be a power of 2 */ + + if (size < PAGE_SIZE) + ptr = kmem_cache_alloc(get_slab(size), flags); + else + ptr = (void *)__get_free_pages(flags, get_order(size)); + + /* Check alignment; SLUB has gotten this wrong in the past, + * and this can lead to user data corruption! 
*/ + BUG_ON(((unsigned long) ptr) & (size-1)); + + return ptr; +} + +void jbd2_free(void *ptr, size_t size) +{ + if (size < PAGE_SIZE) + kmem_cache_free(get_slab(size), ptr); + else + free_pages((unsigned long)ptr, get_order(size)); +}; + +/* + * Journal_head storage management + */ +static struct kmem_cache *jbd2_journal_head_cache; +#ifdef CONFIG_JBD2_DEBUG +static atomic_t nr_journal_heads = ATOMIC_INIT(0); +#endif + +static int __init jbd2_journal_init_journal_head_cache(void) +{ + J_ASSERT(!jbd2_journal_head_cache); + jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", + sizeof(struct journal_head), + 0, /* offset */ + SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU, + NULL); /* ctor */ + if (!jbd2_journal_head_cache) { + printk(KERN_EMERG "JBD2: no memory for journal_head cache\n"); + return -ENOMEM; + } + return 0; +} + +static void jbd2_journal_destroy_journal_head_cache(void) +{ + kmem_cache_destroy(jbd2_journal_head_cache); + jbd2_journal_head_cache = NULL; +} + +/* + * journal_head splicing and dicing + */ +static struct journal_head *journal_alloc_journal_head(void) +{ + struct journal_head *ret; + +#ifdef CONFIG_JBD2_DEBUG + atomic_inc(&nr_journal_heads); +#endif + ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); + if (!ret) { + jbd2_debug(1, "out of memory for journal_head\n"); + pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); + ret = kmem_cache_zalloc(jbd2_journal_head_cache, + GFP_NOFS | __GFP_NOFAIL); + } + spin_lock_init(&ret->b_state_lock); + return ret; +} + +static void journal_free_journal_head(struct journal_head *jh) +{ +#ifdef CONFIG_JBD2_DEBUG + atomic_dec(&nr_journal_heads); + memset(jh, JBD2_POISON_FREE, sizeof(*jh)); +#endif + kmem_cache_free(jbd2_journal_head_cache, jh); +} + +/* + * A journal_head is attached to a buffer_head whenever JBD has an + * interest in the buffer. + * + * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit + * is set. This bit is tested in core kernel code where we need to take + * JBD-specific actions. Testing the zeroness of ->b_private is not reliable + * there. + * + * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one. + * + * When a buffer has its BH_JBD bit set it is immune from being released by + * core kernel code, mainly via ->b_count. + * + * A journal_head is detached from its buffer_head when the journal_head's + * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint + * transaction (b_cp_transaction) hold their references to b_jcount. + * + * Various places in the kernel want to attach a journal_head to a buffer_head + * _before_ attaching the journal_head to a transaction. To protect the + * journal_head in this situation, jbd2_journal_add_journal_head elevates the + * journal_head's b_jcount refcount by one. The caller must call + * jbd2_journal_put_journal_head() to undo this. + * + * So the typical usage would be: + * + * (Attach a journal_head if needed. Increments b_jcount) + * struct journal_head *jh = jbd2_journal_add_journal_head(bh); + * ... + * (Get another reference for transaction) + * jbd2_journal_grab_journal_head(bh); + * jh->b_transaction = xxx; + * (Put original reference) + * jbd2_journal_put_journal_head(jh); + */ + +/* + * Give a buffer_head a journal_head. + * + * May sleep. 
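+ * (The GFP_NOFS allocation below may block and, on failure, retries with
+ * __GFP_NOFAIL, so this must not be called from atomic context.)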
+ */ +struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh; + struct journal_head *new_jh = NULL; + +repeat: + if (!buffer_jbd(bh)) + new_jh = journal_alloc_journal_head(); + + jbd_lock_bh_journal_head(bh); + if (buffer_jbd(bh)) { + jh = bh2jh(bh); + } else { + J_ASSERT_BH(bh, + (atomic_read(&bh->b_count) > 0) || + (bh->b_folio && bh->b_folio->mapping)); + + if (!new_jh) { + jbd_unlock_bh_journal_head(bh); + goto repeat; + } + + jh = new_jh; + new_jh = NULL; /* We consumed it */ + set_buffer_jbd(bh); + bh->b_private = jh; + jh->b_bh = bh; + get_bh(bh); + BUFFER_TRACE(bh, "added journal_head"); + } + jh->b_jcount++; + jbd_unlock_bh_journal_head(bh); + if (new_jh) + journal_free_journal_head(new_jh); + return bh->b_private; +} + +/* + * Grab a ref against this buffer_head's journal_head. If it ended up not + * having a journal_head, return NULL + */ +struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh = NULL; + + jbd_lock_bh_journal_head(bh); + if (buffer_jbd(bh)) { + jh = bh2jh(bh); + jh->b_jcount++; + } + jbd_unlock_bh_journal_head(bh); + return jh; +} +EXPORT_SYMBOL(jbd2_journal_grab_journal_head); + +static void __journal_remove_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh = bh2jh(bh); + + J_ASSERT_JH(jh, jh->b_transaction == NULL); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + J_ASSERT_JH(jh, jh->b_jlist == BJ_None); + J_ASSERT_BH(bh, buffer_jbd(bh)); + J_ASSERT_BH(bh, jh2bh(jh) == bh); + BUFFER_TRACE(bh, "remove journal_head"); + + /* Unlink before dropping the lock */ + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_buffer_jbd(bh); +} + +static void journal_release_journal_head(struct journal_head *jh, size_t b_size) +{ + if (jh->b_frozen_data) { + printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); + jbd2_free(jh->b_frozen_data, b_size); + } + if (jh->b_committed_data) { + printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); + jbd2_free(jh->b_committed_data, b_size); + } + journal_free_journal_head(jh); +} + +/* + * Drop a reference on the passed journal_head. If it fell to zero then + * release the journal_head from the buffer_head. + */ +void jbd2_journal_put_journal_head(struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + jbd_lock_bh_journal_head(bh); + J_ASSERT_JH(jh, jh->b_jcount > 0); + --jh->b_jcount; + if (!jh->b_jcount) { + __journal_remove_journal_head(bh); + jbd_unlock_bh_journal_head(bh); + journal_release_journal_head(jh, bh->b_size); + __brelse(bh); + } else { + jbd_unlock_bh_journal_head(bh); + } +} +EXPORT_SYMBOL(jbd2_journal_put_journal_head); + +/* + * Initialize jbd inode head + */ +void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) +{ + jinode->i_transaction = NULL; + jinode->i_next_transaction = NULL; + jinode->i_vfs_inode = inode; + jinode->i_flags = 0; + jinode->i_dirty_start = 0; + jinode->i_dirty_end = 0; + INIT_LIST_HEAD(&jinode->i_list); +} + +/* + * Function to be called before we start removing inode from memory (i.e., + * clear_inode() is a fine place to be called from). It removes inode from + * transaction's lists. 
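+ * (If a commit is currently writing the inode out, the JI_COMMIT_RUNNING
+ * wait loop below sleeps until that commit finishes before unlinking.)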
+ */ +void jbd2_journal_release_jbd_inode(journal_t *journal, + struct jbd2_inode *jinode) +{ + if (!journal) + return; +restart: + spin_lock(&journal->j_list_lock); + /* Is commit writing out inode - we have to wait */ + if (jinode->i_flags & JI_COMMIT_RUNNING) { + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); + wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + spin_unlock(&journal->j_list_lock); + schedule(); + finish_wait(wq, &wait.wq_entry); + goto restart; + } + + if (jinode->i_transaction) { + list_del(&jinode->i_list); + jinode->i_transaction = NULL; + } + spin_unlock(&journal->j_list_lock); +} + + +#ifdef CONFIG_PROC_FS + +#define JBD2_STATS_PROC_NAME "fs/jbd2" + +static void __init jbd2_create_jbd_stats_proc_entry(void) +{ + proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL); +} + +static void __exit jbd2_remove_jbd_stats_proc_entry(void) +{ + if (proc_jbd2_stats) + remove_proc_entry(JBD2_STATS_PROC_NAME, NULL); +} + +#else + +#define jbd2_create_jbd_stats_proc_entry() do {} while (0) +#define jbd2_remove_jbd_stats_proc_entry() do {} while (0) + +#endif + +struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; + +static int __init jbd2_journal_init_inode_cache(void) +{ + J_ASSERT(!jbd2_inode_cache); + jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0); + if (!jbd2_inode_cache) { + pr_emerg("JBD2: failed to create inode cache\n"); + return -ENOMEM; + } + return 0; +} + +static int __init jbd2_journal_init_handle_cache(void) +{ + J_ASSERT(!jbd2_handle_cache); + jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY); + if (!jbd2_handle_cache) { + printk(KERN_EMERG "JBD2: failed to create handle cache\n"); + return -ENOMEM; + } + return 0; +} + +static void jbd2_journal_destroy_inode_cache(void) +{ + kmem_cache_destroy(jbd2_inode_cache); + jbd2_inode_cache = NULL; +} + +static void jbd2_journal_destroy_handle_cache(void) +{ + kmem_cache_destroy(jbd2_handle_cache); + jbd2_handle_cache = NULL; +} + +/* + * Module startup and shutdown + */ + +static int __init journal_init_caches(void) +{ + int ret; + + ret = jbd2_journal_init_revoke_record_cache(); + if (ret == 0) + ret = jbd2_journal_init_revoke_table_cache(); + if (ret == 0) + ret = jbd2_journal_init_journal_head_cache(); + if (ret == 0) + ret = jbd2_journal_init_handle_cache(); + if (ret == 0) + ret = jbd2_journal_init_inode_cache(); + if (ret == 0) + ret = jbd2_journal_init_transaction_cache(); + return ret; +} + +static void jbd2_journal_destroy_caches(void) +{ + jbd2_journal_destroy_revoke_record_cache(); + jbd2_journal_destroy_revoke_table_cache(); + jbd2_journal_destroy_journal_head_cache(); + jbd2_journal_destroy_handle_cache(); + jbd2_journal_destroy_inode_cache(); + jbd2_journal_destroy_transaction_cache(); + jbd2_journal_destroy_slabs(); +} + +static int __init journal_init(void) +{ + int ret; + + BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024); + + ret = journal_init_caches(); + if (ret == 0) { + jbd2_create_jbd_stats_proc_entry(); + } else { + jbd2_journal_destroy_caches(); + } + return ret; +} + +static void __exit journal_exit(void) +{ +#ifdef CONFIG_JBD2_DEBUG + int n = atomic_read(&nr_journal_heads); + if (n) + printk(KERN_ERR "JBD2: leaked %d journal_heads!\n", n); +#endif + jbd2_remove_jbd_stats_proc_entry(); + jbd2_journal_destroy_caches(); +} + +MODULE_DESCRIPTION("Generic filesystem journal-writing module"); +MODULE_LICENSE("GPL"); +module_init(journal_init); 
+module_exit(journal_exit); + -- 2.43.0
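The add/grab/put trio above is, at heart, a plain reference count on the journal_head attached to each buffer_head: jbd2_journal_add_journal_head() creates the head (or finds an existing one) and takes a reference, jbd2_journal_grab_journal_head() takes a reference only if a head already exists, and jbd2_journal_put_journal_head() frees the head and releases the buffer when the count reaches zero. A minimal standalone model of that lifecycle, with hypothetical names and none of the locking:

#include <stdlib.h>

struct head {
	int count;
};

/* Create a head holding one reference (models add_journal_head) */
static struct head *head_add(void)
{
	struct head *h = calloc(1, sizeof(*h));

	if (h)
		h->count = 1;
	return h;
}

/* Take an extra reference only if a head exists (models grab) */
static struct head *head_grab(struct head *h)
{
	if (h)
		h->count++;
	return h;
}

/* Drop a reference; free on the last one (models put) */
static void head_put(struct head *h)
{
	if (h && !--h->count)
		free(h);
}

int main(void)
{
	struct head *h = head_add();	/* count = 1 */

	head_grab(h);			/* count = 2 */
	head_put(h);			/* count = 1 */
	head_put(h);			/* count = 0: freed */
	return 0;
}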
From: Simon Glass <simon.glass@canonical.com> Update checkpoint.c includes to use ext4_uboot.h compatibility layer. Add jbd2/Makefile and include jbd2 in the build via fs/Makefile Add necessary stubs and definitions: - JBD2 trace stubs (trace_jbd2_checkpoint, etc.) - mutex_lock_io, write_dirty_buffer, spin_needbreak stubs - bd_dev field to struct block_device - Temporary JBD2 function stubs until other jbd2 files are added Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: Simon Glass <simon.glass@canonical.com> --- fs/Makefile | 1 + fs/ext4l/ext4_uboot.h | 14 ++++++++++++++ fs/ext4l/stub.c | 33 +++++++++++++++++++++++++++++++++ fs/jbd2/Makefile | 6 ++++++ fs/jbd2/checkpoint.c | 7 +------ include/linux/fs.h | 1 + 6 files changed, 56 insertions(+), 6 deletions(-) create mode 100644 fs/jbd2/Makefile diff --git a/fs/Makefile b/fs/Makefile index a7d5df10424..b6d4f7a9cf7 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_CMD_CRAMFS) += cramfs/ obj-$(CONFIG_FS_EXFAT) += exfat/ obj-$(CONFIG_FS_EXT4) += ext4/ obj-$(CONFIG_FS_EXT4L) += ext4l/ +obj-$(CONFIG_FS_EXT4L) += jbd2/ obj-$(CONFIG_FS_FAT) += fat/ obj-$(CONFIG_FS_JFFS2) += jffs2/ obj-$(CONFIG_SANDBOX) += sandbox/ diff --git a/fs/ext4l/ext4_uboot.h b/fs/ext4l/ext4_uboot.h index 82b68056953..62815e334c8 100644 --- a/fs/ext4l/ext4_uboot.h +++ b/fs/ext4l/ext4_uboot.h @@ -2768,4 +2768,18 @@ struct wait_bit_entry { /* get_current_ioprio - I/O priority (not used in U-Boot) */ #define get_current_ioprio() (0) +/* JBD2 checkpoint.c stubs */ +#define mutex_lock_io(m) mutex_lock(m) +#define write_dirty_buffer(bh, flags) sync_dirty_buffer(bh) +#define spin_needbreak(l) ({ (void)(l); 0; }) + +/* JBD2 trace stubs */ +#define trace_jbd2_checkpoint(j, r) do { (void)(j); (void)(r); } while (0) +#define trace_jbd2_shrink_checkpoint_list(j, f, t, l, n, d) \ + do { (void)(j); (void)(f); (void)(t); (void)(l); (void)(n); (void)(d); } while (0) +#define trace_jbd2_checkpoint_stats(d, tid, stats) \ + do { (void)(d); (void)(tid); (void)(stats); } while (0) +#define trace_jbd2_drop_transaction(j, t) \ + do { (void)(j); (void)(t); } while (0) + #endif /* __EXT4_UBOOT_H__ */ diff --git a/fs/ext4l/stub.c b/fs/ext4l/stub.c index 2b2b777d1f2..0458191c496 100644 --- a/fs/ext4l/stub.c +++ b/fs/ext4l/stub.c @@ -827,3 +827,36 @@ int dquot_alloc_block(struct inode *inode, loff_t nr) void dquot_free_block(struct inode *inode, loff_t nr) { } + +/* + * JBD2 stubs - temporary stubs until other jbd2 files are added + * Note: These use void* to avoid pulling in jbd2.h types which would conflict + */ + +int jbd2_journal_get_log_tail(void *journal, void *tid, unsigned long *block) +{ + return 0; +} + +int __jbd2_update_log_tail(void *journal, unsigned int tid, unsigned long block) +{ + return 0; +} + +void *jbd2_journal_grab_journal_head(struct buffer_head *bh) +{ + return NULL; +} + +void jbd2_journal_put_journal_head(void *jh) +{ +} + +void jbd2_journal_free_transaction(void *transaction) +{ +} + +int jbd2_log_start_commit(void *journal, unsigned int tid) +{ + return 0; +} diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile new file mode 100644 index 00000000000..03e8ba0f2ec --- /dev/null +++ b/fs/jbd2/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for JBD2 journaling layer (used by ext4l) +# + +obj-y := checkpoint.o diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 2d0719bf6d8..994e454bb5b 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -14,13 +14,8 @@ * reused. 
*/ -#include <linux/time.h> -#include <linux/fs.h> +#include "../ext4l/ext4_uboot.h" #include <linux/jbd2.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/blkdev.h> -#include <trace/events/jbd2.h> /* * Unlink a buffer from a transaction checkpoint list. diff --git a/include/linux/fs.h b/include/linux/fs.h index ae34b1f7356..ef28c12c022 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -65,6 +65,7 @@ struct block_device { struct address_space *bd_mapping; void *bd_disk; struct super_block *bd_super; + dev_t bd_dev; }; /* errseq functions - stubs */ -- 2.43.0
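The stubs added above all follow one idiom used throughout ext4_uboot.h: expand to a do-while(0) block (or statement expression) that casts each argument to void, so the arguments are still evaluated, preserving any side effects, while the compiler stays quiet about unused values. A standalone sketch of the idiom, using a hypothetical trace_example() macro:

#include <stdio.h>

/* No-op trace stub in the style above: evaluate each argument (so
 * side effects still happen) but discard the results. */
#define trace_example(a, b) do { (void)(a); (void)(b); } while (0)

int main(void)
{
	int calls = 0;

	/* calls++ is still evaluated even though the trace is a no-op */
	trace_example(calls++, "commit");
	printf("calls = %d\n", calls);	/* prints 1 */
	return 0;
}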
From: Simon Glass <simon.glass@canonical.com> Add journal.c to the jbd2 Makefile and update includes to use the ext4l compatibility layer. Add stubs for functions from recovery.c, revoke.c, and transaction.c that journal.c depends on. Remove JBD2 function stubs from stub.c that are now provided by journal.c. Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: Simon Glass <simon.glass@canonical.com> --- fs/ext4l/ext4_uboot.h | 106 ++++++++++++++++++++++- fs/ext4l/stub.c | 190 +++++++++++++++++++----------------------- fs/jbd2/Makefile | 2 +- fs/jbd2/commit.c | 15 +--- fs/jbd2/journal.c | 28 +------ 5 files changed, 193 insertions(+), 148 deletions(-) diff --git a/fs/ext4l/ext4_uboot.h b/fs/ext4l/ext4_uboot.h index 62815e334c8..0d24940e74e 100644 --- a/fs/ext4l/ext4_uboot.h +++ b/fs/ext4l/ext4_uboot.h @@ -126,6 +126,12 @@ struct kobject { const char *name; }; +/* lockdep stubs - needed before jbd2.h is included */ +struct lockdep_map { int dummy; }; +struct lock_class_key { int dummy; }; +#define rwsem_acquire(l, s, t, i) do { } while (0) +#define rwsem_release(l, i) do { } while (0) + /* completion - stub */ struct completion { unsigned int done; @@ -1211,8 +1217,11 @@ struct folio_batch { /* folio operations - stubs */ #define folio_mark_dirty(f) do { (void)(f); } while (0) -#define offset_in_folio(f, p) ({ (void)(f); (unsigned int)((p) & (PAGE_SIZE - 1)); }) +#define offset_in_folio(f, p) ({ (void)(f); (unsigned int)((unsigned long)(p) & (PAGE_SIZE - 1)); }) #define folio_buffers(f) ({ (void)(f); (struct buffer_head *)NULL; }) +#define virt_to_folio(p) ({ (void)(p); (struct folio *)NULL; }) +#define folio_set_bh(bh, f, off) do { (void)(bh); (void)(f); (void)(off); } while (0) +#define memcpy_from_folio(dst, f, off, len) do { (void)(dst); (void)(f); (void)(off); (void)(len); } while (0) #define folio_test_uptodate(f) ({ (void)(f); 1; }) #define folio_pos(f) ({ (void)(f); 0LL; }) #define folio_size(f) ({ (void)(f); PAGE_SIZE; }) @@ -2782,4 +2791,99 @@ struct wait_bit_entry { #define trace_jbd2_drop_transaction(j, t) \ do { (void)(j); (void)(t); } while (0) +/* JBD2 commit.c stubs */ +#define clear_bit_unlock(nr, addr) clear_bit(nr, addr) +#define smp_mb__after_atomic() do { } while (0) +#define folio_trylock(f) ({ (void)(f); 1; }) +#define ktime_get_coarse_real_ts64(ts) do { (ts)->tv_sec = 0; (ts)->tv_nsec = 0; } while (0) +#define filemap_fdatawait_range_keep_errors(m, s, e) \ + ({ (void)(m); (void)(s); (void)(e); 0; }) +#define crc32_be(crc, p, len) crc32(crc, p, len) +#define free_buffer_head(bh) kfree(bh) +#define sb_is_blkdev_sb(sb) ({ (void)(sb); 0; }) + +/* DEFINE_WAIT stub - creates a wait queue entry */ +#define DEFINE_WAIT(name) int name = 0 + +/* cond_resched_lock - conditionally reschedule while holding a lock */ +#define cond_resched_lock(lock) do { (void)(lock); } while (0) + +/* More JBD2 trace stubs for commit.c */ +#define trace_jbd2_submit_inode_data(i) do { (void)(i); } while (0) +#define trace_jbd2_start_commit(j, t) do { (void)(j); (void)(t); } while (0) +#define trace_jbd2_commit_locking(j, t) do { (void)(j); (void)(t); } while (0) +#define trace_jbd2_commit_flushing(j, t) do { (void)(j); (void)(t); } while (0) +#define trace_jbd2_commit_logging(j, t) do { (void)(j); (void)(t); } while (0) +#define trace_jbd2_run_stats(d, tid, stats) \ + do { (void)(d); (void)(tid); (void)(stats); } while (0) +#define trace_jbd2_end_commit(j, t) do { (void)(j); (void)(t); } while (0) + +/* JBD2 journal.c stubs */ +#define alloc_buffer_head(gfp) ((struct buffer_head 
*)kzalloc(sizeof(struct buffer_head), gfp)) +#define __getblk(bdev, block, size) ({ (void)(bdev); (void)(block); (void)(size); (struct buffer_head *)NULL; }) +#define bmap(inode, block) ({ (void)(inode); (void)(block); 0; }) +#define trace_jbd2_update_log_tail(j, t, b, f) \ + do { (void)(j); (void)(t); (void)(b); (void)(f); } while (0) + +/* seq_file operations for /proc - stubs */ +#define seq_open(f, ops) ({ (void)(f); (void)(ops); 0; }) +#define seq_release(i, f) ({ (void)(i); (void)(f); 0; }) + +/* proc_ops structure for journal.c */ +struct proc_ops { + int (*proc_open)(struct inode *, struct file *); + ssize_t (*proc_read)(struct file *, char *, size_t, loff_t *); + loff_t (*proc_lseek)(struct file *, loff_t, int); + int (*proc_release)(struct inode *, struct file *); +}; + +/* seq_read and seq_lseek declarations (defined in stub.c) */ +ssize_t seq_read(struct file *f, char *b, size_t s, loff_t *p); +loff_t seq_lseek(struct file *f, loff_t o, int w); + +/* S_IRUGO file mode if not defined */ +#ifndef S_IRUGO +#define S_IRUGO (S_IRUSR | S_IRGRP | S_IROTH) +#endif + +/* procfs stubs */ +#define proc_mkdir(name, parent) ({ (void)(name); (void)(parent); (struct proc_dir_entry *)NULL; }) +#define proc_create_data(n, m, p, ops, d) \ + ({ (void)(n); (void)(m); (void)(p); (void)(ops); (void)(d); (struct proc_dir_entry *)NULL; }) +#define remove_proc_entry(n, p) do { (void)(n); (void)(p); } while (0) + +/* lockdep stubs (struct lock_class_key defined earlier) */ +#define lockdep_init_map(...) do { } while (0) + +/* More JBD2 trace stubs for journal.c */ +#define trace_jbd2_shrink_scan_enter(j, n, c) \ + do { (void)(j); (void)(n); (void)(c); } while (0) +#define trace_jbd2_shrink_scan_exit(j, n, s, c) \ + do { (void)(j); (void)(n); (void)(s); (void)(c); } while (0) +#define trace_jbd2_shrink_count(j, n, c) \ + do { (void)(j); (void)(n); (void)(c); } while (0) +#define trace_jbd2_write_superblock(j, f) \ + do { (void)(j); (void)(f); } while (0) + +/* Block device operations for journal.c */ +#define bh_read(bh, flags) ({ (void)(bh); (void)(flags); 0; }) +#define truncate_inode_pages_range(m, s, e) \ + do { (void)(m); (void)(s); (void)(e); } while (0) +#define blkdev_issue_discard(bdev, s, n, gfp) \ + ({ (void)(bdev); (void)(s); (void)(n); (void)(gfp); 0; }) +#define blkdev_issue_zeroout(bdev, s, n, gfp, f) \ + ({ (void)(bdev); (void)(s); (void)(n); (void)(gfp); (void)(f); 0; }) +#ifndef SECTOR_SHIFT +#define SECTOR_SHIFT 9 +#endif +#define mapping_max_folio_order(m) ({ (void)(m); 0; }) + +/* Memory allocation for journal.c */ +#define __get_free_pages(gfp, order) ((unsigned long)memalign(PAGE_SIZE, PAGE_SIZE << (order))) +#define free_pages(addr, order) free((void *)(addr)) +#define get_order(size) ilog2(roundup_pow_of_two((size) / PAGE_SIZE)) + +/* Ratelimited printk for journal.c */ +#define pr_notice_ratelimited(fmt, ...) 
pr_notice(fmt, ##__VA_ARGS__) + #endif /* __EXT4_UBOOT_H__ */ diff --git a/fs/ext4l/stub.c b/fs/ext4l/stub.c index 0458191c496..c9d1d3f99eb 100644 --- a/fs/ext4l/stub.c +++ b/fs/ext4l/stub.c @@ -96,10 +96,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) return 0; } -int jbd2_journal_force_commit_nested(journal_t *journal) -{ - return 0; -} +/* jbd2_journal_force_commit_nested is now in journal.c */ int jbd2__journal_restart(void *handle, int nblocks, int revoke_records, int gfp_mask) @@ -107,10 +104,7 @@ int jbd2__journal_restart(void *handle, int nblocks, int revoke_records, return 0; } -int jbd2_trans_will_send_data_barrier(journal_t *journal, unsigned long tid) -{ - return 0; -} +/* jbd2_trans_will_send_data_barrier is now in journal.c */ /* * Stubs for balloc.c @@ -189,53 +183,15 @@ struct extent_status; /* ext4_fc_replay_check_excluded is now in fast_commit.c */ -/* - * JBD2 fast commit stubs - */ -int jbd2_fc_get_buf(void *journal, struct buffer_head **bh_out) -{ - *bh_out = NULL; - return -ENOSPC; -} - -void jbd2_fc_release_bufs(void *journal) -{ -} - -int jbd2_fc_begin_commit(void *journal, unsigned int tid) -{ - return -EOPNOTSUPP; -} - -int jbd2_fc_end_commit(void *journal) -{ - return 0; -} - -int jbd2_fc_end_commit_fallback(void *journal) -{ - return 0; -} - -int jbd2_submit_inode_data(void *journal, void *jinode) -{ - return 0; -} - -int jbd2_wait_inode_data(void *journal, void *jinode) -{ - return 0; -} - -int jbd2_fc_wait_bufs(void *journal, int num) -{ - return 0; -} - -int jbd2_complete_transaction(void *journal, unsigned int tid) -{ - return 0; -} +/* jbd2_submit_inode_data is now in commit.c */ +/* jbd2_wait_inode_data is now in commit.c */ +/* jbd2_fc_get_buf is now in journal.c */ +/* jbd2_fc_release_bufs is now in journal.c */ +/* jbd2_fc_begin_commit is now in journal.c */ +/* jbd2_fc_end_commit is now in journal.c */ +/* jbd2_fc_end_commit_fallback is now in journal.c */ +/* jbd2_fc_wait_bufs is now in journal.c */ +/* jbd2_complete_transaction is now in journal.c */ void ext4_reset_inode_seed(struct inode *inode) { @@ -268,16 +224,8 @@ struct ext4_iloc; */ #include <linux/sched.h> -/* JBD2 stubs for inode.c */ -int jbd2_journal_blocks_per_folio(struct inode *inode) -{ - return 1; -} - -int jbd2_transaction_committed(void *journal, unsigned int tid) -{ - return 1; -} +/* jbd2_journal_blocks_per_folio is now in journal.c */ +/* jbd2_transaction_committed is now in journal.c */ /* __ext4_warning_inode is now in super.c */ @@ -318,10 +266,7 @@ void jbd2_journal_invalidate_folio(void *journal, void *folio, { } -int jbd2_log_wait_commit(void *journal, unsigned int tid) -{ - return 0; -} +/* jbd2_log_wait_commit is now in journal.c */ /* ext4_fc_track_range is now in fast_commit.c */ @@ -334,10 +279,7 @@ void jbd2_journal_unlock_updates(void *journal) { } -int jbd2_journal_flush(void *journal, unsigned int flags) -{ - return 0; -} +/* jbd2_journal_flush is now in journal.c */ /* ext4_fc_track_inode is now in fast_commit.c */ @@ -370,17 +312,14 @@ int jbd2_journal_inode_ranged_wait(void *handle, struct inode *inode, /* xattr stubs are now in xattr.c */ -/* JBD2 stubs for inode.c */ -struct kmem_cache *jbd2_inode_cache; +/* jbd2_inode_cache is now in journal.c */ int jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio) { return 1; } -void jbd2_journal_init_jbd_inode(void *jinode, struct inode *inode) -{ -} +/* jbd2_journal_init_jbd_inode is now in journal.c */ /* ext4_read_inline_link is now in inline.c */ @@ -629,11 +568,7 
@@ void ext4_unregister_sysfs(void *sb) { } -/* Journal destroy */ -int jbd2_journal_destroy(void *journal) -{ - return 0; -} +/* jbd2_journal_destroy is now in journal.c */ /* percpu rwsem */ void percpu_free_rwsem(struct percpu_rw_semaphore *sem) @@ -771,11 +706,7 @@ void fsnotify_sb_error(struct super_block *sb, struct inode *inode, int error) { } -/* JBD2 force commit */ -int jbd2_journal_force_commit(void *journal) -{ - return 0; -} +/* jbd2_journal_force_commit is now in journal.c */ /* File path */ char *file_path(struct file *file, char *buf, int buflen) @@ -796,15 +727,9 @@ long ext4_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return -ENOTSUPP; } -/* JBD2 journal abort */ -void jbd2_journal_abort(void *journal, int error) -{ -} +/* jbd2_journal_abort is now in journal.c */ -/* JBD2 journal inode release */ -void jbd2_journal_release_jbd_inode(void *journal, void *jinode) -{ -} +/* jbd2_journal_release_jbd_inode is now in journal.c */ /* nop_mnt_idmap - no-op mount ID map for xattr.c */ struct mnt_idmap nop_mnt_idmap; @@ -833,30 +758,85 @@ void dquot_free_block(struct inode *inode, loff_t nr) * Note: These use void* to avoid pulling in jbd2.h types which would conflict */ -int jbd2_journal_get_log_tail(void *journal, void *tid, unsigned long *block) +/* jbd2_journal_get_log_tail is now in journal.c */ +/* __jbd2_update_log_tail is now in journal.c */ +/* jbd2_journal_grab_journal_head is now in journal.c */ +/* jbd2_journal_put_journal_head is now in journal.c */ + +void jbd2_journal_free_transaction(void *transaction) { - return 0; } -int __jbd2_update_log_tail(void *journal, unsigned int tid, unsigned long block) +/* jbd2_log_start_commit is now in journal.c */ + +/* jbd2_journal_get_descriptor_buffer is now in journal.c */ +/* jbd2_journal_update_sb_log_tail is now in journal.c */ +/* jbd2_free is now in journal.c */ + +/* journal_tag_bytes is now in journal.c */ + +void jbd2_journal_wait_updates(void *journal) { - return 0; } -void *jbd2_journal_grab_journal_head(struct buffer_head *bh) +void jbd2_journal_refile_buffer(void *journal, void *jh) { - return NULL; } -void jbd2_journal_put_journal_head(void *jh) +void jbd2_clear_buffer_revoked_flags(void *journal) { } -void jbd2_journal_free_transaction(void *transaction) +void jbd2_journal_switch_revoke_table(void *journal) +{ +} + +void jbd2_journal_write_revoke_records(void *journal, void *transaction, + int write_flags) +{ +} + +void jbd2_buffer_abort_trigger(void *jh, void *triggers) +{ +} + +/* jbd2_journal_next_log_block is now in journal.c */ +/* jbd2_journal_write_metadata_buffer is now in journal.c */ +/* jbd2_descriptor_block_csum_set is now in journal.c */ +/* jbd2_update_log_tail is now in journal.c */ + +void jbd2_journal_file_buffer(void *jh, void *transaction, int type) +{ +} + +void __jbd2_journal_refile_buffer(void *jh) +{ +} + +/* cond_resched_lock is now a macro in ext4_uboot.h */ + +/* + * JBD2 stubs for journal.c - functions from recovery.c, revoke.c, transaction.c + */ +int jbd2_journal_destroy_revoke(void *journal) { + return 0; +} + +int jbd2_journal_recover(void *journal) +{ + return 0; } -int jbd2_log_start_commit(void *journal, unsigned int tid) +int jbd2_journal_skip_recovery(void *journal) { return 0; } + +void jbd2_buffer_frozen_trigger(void *jh, void *mapped_data, void *triggers) +{ +} + +void __jbd2_journal_file_buffer(void *jh, void *transaction, int type) +{ +} diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile index 03e8ba0f2ec..f0f84bfb707 100644 --- a/fs/jbd2/Makefile +++ 
b/fs/jbd2/Makefile @@ -3,4 +3,4 @@ # Makefile for JBD2 journaling layer (used by ext4l) # -obj-y := checkpoint.o +obj-y := checkpoint.o commit.o journal.o diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 7203d2d2624..d4e63a91e87 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -10,21 +10,8 @@ * part of the ext2fs journaling system. */ -#include <linux/time.h> -#include <linux/fs.h> +#include "../ext4l/ext4_uboot.h" #include <linux/jbd2.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/jiffies.h> -#include <linux/crc32.h> -#include <linux/writeback.h> -#include <linux/backing-dev.h> -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/bitops.h> -#include <trace/events/jbd2.h> /* * IO end handler for temporary buffer_heads handling writes to the journal. diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index d480b94117c..0b77fd0f34b 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -19,34 +19,8 @@ * journaling (ext2 can use a reserved inode for storing the log). */ -#include <linux/module.h> -#include <linux/time.h> -#include <linux/fs.h> +#include "../ext4l/ext4_uboot.h" #include <linux/jbd2.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/mm.h> -#include <linux/freezer.h> -#include <linux/pagemap.h> -#include <linux/kthread.h> -#include <linux/poison.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/math64.h> -#include <linux/hash.h> -#include <linux/log2.h> -#include <linux/vmalloc.h> -#include <linux/backing-dev.h> -#include <linux/bitops.h> -#include <linux/ratelimit.h> -#include <linux/sched/mm.h> - -#define CREATE_TRACE_POINTS -#include <trace/events/jbd2.h> - -#include <linux/uaccess.h> -#include <asm/page.h> #ifdef CONFIG_JBD2_DEBUG static ushort jbd2_journal_enable_debug __read_mostly; -- 2.43.0
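journal.c allocates its working buffers with __get_free_pages(), which the macro above maps onto memalign() so the result keeps page alignment. A standalone model of that mapping, using C11 aligned_alloc() here purely so the sketch compiles outside U-Boot:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

/* Model of the __get_free_pages()/free_pages() stubs: a 2^order page
 * allocation becomes a page-aligned heap allocation. */
static unsigned long get_free_pages_model(int order)
{
	return (unsigned long)aligned_alloc(PAGE_SIZE, PAGE_SIZE << order);
}

static void free_pages_model(unsigned long addr)
{
	free((void *)addr);
}

int main(void)
{
	unsigned long addr = get_free_pages_model(2);	/* 4 pages, 16 KiB */

	if (!addr)
		return 1;
	printf("page aligned: %s\n", (addr % PAGE_SIZE) ? "no" : "yes");
	free_pages_model(addr);
	return 0;
}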
From: Simon Glass <simon.glass@canonical.com> Add recovery.c to the jbd2 Makefile and update includes to use the ext4l compatibility layer. Add stubs for revoke.c functions that recovery.c depends on and tweak a few so things build. Fix sync_blockdev to return int instead of void. Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: Simon Glass <simon.glass@canonical.com> --- fs/ext4l/ext4_uboot.h | 4 +++- fs/ext4l/stub.c | 24 +++++++++++++++++++++--- fs/jbd2/Makefile | 2 +- fs/jbd2/recovery.c | 13 ++----------- 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/fs/ext4l/ext4_uboot.h b/fs/ext4l/ext4_uboot.h index 0d24940e74e..25756d29cf6 100644 --- a/fs/ext4l/ext4_uboot.h +++ b/fs/ext4l/ext4_uboot.h @@ -2162,7 +2162,7 @@ int percpu_init_rwsem(struct percpu_rw_semaphore *sem); void percpu_free_rwsem(struct percpu_rw_semaphore *sem); /* Block device sync - declarations for stub.c */ -void sync_blockdev(struct block_device *bdev); +int sync_blockdev(struct block_device *bdev); void invalidate_bdev(struct block_device *bdev); /* Kobject - declarations for stub.c */ @@ -2867,6 +2867,8 @@ loff_t seq_lseek(struct file *f, loff_t o, int w); /* Block device operations for journal.c */ #define bh_read(bh, flags) ({ (void)(bh); (void)(flags); 0; }) +#define bh_read_nowait(bh, flags) do { (void)(bh); (void)(flags); } while (0) +#define bh_readahead_batch(n, bhs, f) do { (void)(n); (void)(bhs); (void)(f); } while (0) #define truncate_inode_pages_range(m, s, e) \ do { (void)(m); (void)(s); (void)(e); } while (0) #define blkdev_issue_discard(bdev, s, n, gfp) \ diff --git a/fs/ext4l/stub.c b/fs/ext4l/stub.c index c9d1d3f99eb..b1b5bb93cd8 100644 --- a/fs/ext4l/stub.c +++ b/fs/ext4l/stub.c @@ -576,8 +576,9 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *sem) } /* Block device ops */ -void sync_blockdev(struct block_device *bdev) +int sync_blockdev(struct block_device *bdev) { + return 0; } void invalidate_bdev(struct block_device *bdev) @@ -823,16 +824,33 @@ int jbd2_journal_destroy_revoke(void *journal) return 0; } -int jbd2_journal_recover(void *journal) +/* jbd2_journal_recover is now in recovery.c */ +/* jbd2_journal_skip_recovery is now in recovery.c */ + +/* JBD2 stubs for recovery.c - functions from revoke.c */ +int jbd2_journal_init_revoke_table(void *journal) +{ + return 0; +} + +int jbd2_journal_test_revoke(void *journal, unsigned long blocknr, unsigned int tid) { return 0; } -int jbd2_journal_skip_recovery(void *journal) +int jbd2_journal_set_revoke(void *journal, unsigned long blocknr, unsigned int tid) { return 0; } +void jbd2_journal_clear_revoke(void *journal) +{ +} + +void jbd2_journal_destroy_revoke_table(void *table) +{ +} + void jbd2_buffer_frozen_trigger(void *jh, void *mapped_data, void *triggers) { } diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile index f0f84bfb707..bcc1f47177b 100644 --- a/fs/jbd2/Makefile +++ b/fs/jbd2/Makefile @@ -3,4 +3,4 @@ # Makefile for JBD2 journaling layer (used by ext4l) # -obj-y := checkpoint.o commit.o journal.o +obj-y := checkpoint.o commit.o journal.o recovery.o diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index cac8c2cd4a9..ff2266c7316 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -10,17 +10,8 @@ * part of the ext2fs journaling system. 
*/ -#ifndef __KERNEL__ -#include "jfs_user.h" -#else -#include <linux/time.h> -#include <linux/fs.h> +#include "../ext4l/ext4_uboot.h" #include <linux/jbd2.h> -#include <linux/errno.h> -#include <linux/crc32.h> -#include <linux/blkdev.h> -#include <linux/string_choices.h> -#endif /* * Maintain information about the progress of the recovery job, so that @@ -628,7 +619,7 @@ static int do_one_pass(journal_t *journal, * filesystems. */ hash_size = min(roundup_pow_of_two(info->nr_revokes / 8), - 1U << 20); + 1UL << 20); journal->j_revoke = jbd2_journal_init_revoke_table(hash_size); if (!journal->j_revoke) { -- 2.43.0
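The 1U to 1UL change in do_one_pass() above keeps both operands of min() the same type: roundup_pow_of_two() yields an unsigned long, and type-checked min() implementations reject mixed-type comparisons. A standalone model of the revoke-table sizing, with a hypothetical round-up helper standing in for roundup_pow_of_two():

#include <stdio.h>

/* Round v up to the next power of two (v > 0) */
static unsigned long roundup_p2(unsigned long v)
{
	unsigned long p = 1;

	while (p < v)
		p <<= 1;
	return p;
}

int main(void)
{
	unsigned long nr_revokes = 100000;
	unsigned long hash_size = roundup_p2(nr_revokes / 8);

	/* Cap the table at 2^20 buckets, as do_one_pass() does */
	if (hash_size > (1UL << 20))
		hash_size = 1UL << 20;
	printf("hash_size = %lu\n", hash_size);	/* 16384 */
	return 0;
}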
From: Simon Glass <simon.glass@canonical.com> Add revoke.c to the jbd2 Makefile and update its includes to use the compatibility layer. Add stubs for hash_64 and __find_get_block_nonatomic() Remove JBD2 revoke function stubs that are now provided by revoke.c. Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: Simon Glass <simon.glass@canonical.com> --- fs/ext4l/ext4_uboot.h | 5 ++++ fs/ext4l/stub.c | 57 +++++++++---------------------------------- fs/jbd2/Makefile | 2 +- fs/jbd2/revoke.c | 14 +---------- 4 files changed, 18 insertions(+), 60 deletions(-) diff --git a/fs/ext4l/ext4_uboot.h b/fs/ext4l/ext4_uboot.h index 25756d29cf6..f43c3a999f9 100644 --- a/fs/ext4l/ext4_uboot.h +++ b/fs/ext4l/ext4_uboot.h @@ -1365,6 +1365,9 @@ typedef unsigned int projid_t; #include <log.h> #define ilog2(n) (fls(n) - 1) +/* hash_64 - simple 64-bit hash */ +#define hash_64(val, bits) ((unsigned long)((val) >> (64 - (bits)))) + /* Trace stubs for inode.c */ #define trace_ext4_begin_ordered_truncate(...) do { } while (0) #define trace_ext4_evict_inode(...) do { } while (0) @@ -2569,6 +2572,8 @@ struct seq_operations { /* Block device operations */ #define sb_find_get_block_nonatomic(sb, block) \ ({ (void)(sb); (void)(block); (struct buffer_head *)NULL; }) +#define __find_get_block_nonatomic(bdev, block, size) \ + ({ (void)(bdev); (void)(block); (void)(size); (struct buffer_head *)NULL; }) #define bdev_discard_granularity(bdev) \ ({ (void)(bdev); 0U; }) diff --git a/fs/ext4l/stub.c b/fs/ext4l/stub.c index b1b5bb93cd8..989ae2c85c1 100644 --- a/fs/ext4l/stub.c +++ b/fs/ext4l/stub.c @@ -80,11 +80,7 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) return 0; } -int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, - struct buffer_head *bh) -{ - return 0; -} +/* jbd2_journal_revoke is now in revoke.c */ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) { @@ -784,18 +780,9 @@ void jbd2_journal_refile_buffer(void *journal, void *jh) { } -void jbd2_clear_buffer_revoked_flags(void *journal) -{ -} - -void jbd2_journal_switch_revoke_table(void *journal) -{ -} - -void jbd2_journal_write_revoke_records(void *journal, void *transaction, - int write_flags) -{ -} +/* jbd2_clear_buffer_revoked_flags is now in revoke.c */ +/* jbd2_journal_switch_revoke_table is now in revoke.c */ +/* jbd2_journal_write_revoke_records is now in revoke.c */ void jbd2_buffer_abort_trigger(void *jh, void *triggers) { @@ -817,39 +804,17 @@ void __jbd2_journal_refile_buffer(void *jh) /* cond_resched_lock is now a macro in ext4_uboot.h */ /* - * JBD2 stubs for journal.c - functions from recovery.c, revoke.c, transaction.c + * JBD2 stubs for journal.c - functions from transaction.c */ -int jbd2_journal_destroy_revoke(void *journal) -{ - return 0; -} /* jbd2_journal_recover is now in recovery.c */ /* jbd2_journal_skip_recovery is now in recovery.c */ - -/* JBD2 stubs for recovery.c - functions from revoke.c */ -int jbd2_journal_init_revoke_table(void *journal) -{ - return 0; -} - -int jbd2_journal_test_revoke(void *journal, unsigned long blocknr, unsigned int tid) -{ - return 0; -} - -int jbd2_journal_set_revoke(void *journal, unsigned long blocknr, unsigned int tid) -{ - return 0; -} - -void jbd2_journal_clear_revoke(void *journal) -{ -} - -void jbd2_journal_destroy_revoke_table(void *table) -{ -} +/* jbd2_journal_destroy_revoke is now in revoke.c */ +/* jbd2_journal_init_revoke_table is now in revoke.c */ +/* jbd2_journal_test_revoke is now in revoke.c */ +/* 
jbd2_journal_set_revoke is now in revoke.c */ +/* jbd2_journal_clear_revoke is now in revoke.c */ +/* jbd2_journal_destroy_revoke_table is now in revoke.c */ void jbd2_buffer_frozen_trigger(void *jh, void *mapped_data, void *triggers) { diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile index bcc1f47177b..5501c364abc 100644 --- a/fs/jbd2/Makefile +++ b/fs/jbd2/Makefile @@ -3,4 +3,4 @@ # Makefile for JBD2 journaling layer (used by ext4l) # -obj-y := checkpoint.o commit.o journal.o recovery.o +obj-y := checkpoint.o commit.o journal.o recovery.o revoke.o diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 1467f679074..b5e6fa6cf96 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -77,20 +77,8 @@ * needed. */ -#ifndef __KERNEL__ -#include "jfs_user.h" -#else -#include <linux/time.h> -#include <linux/fs.h> +#include "../ext4l/ext4_uboot.h" #include <linux/jbd2.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/list.h> -#include <linux/init.h> -#include <linux/bio.h> -#include <linux/log2.h> -#include <linux/hash.h> -#endif static struct kmem_cache *jbd2_revoke_record_cache; static struct kmem_cache *jbd2_revoke_table_cache; -- 2.43.0
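The hash_64() stand-in above is not a mixing hash like the kernel's: it simply takes the top 'bits' bits of the value as the bucket index, which is enough for correctness in the revoke hash table (any deterministic function works), just not for distribution. A quick standalone check of what it yields:

#include <stdio.h>
#include <stdint.h>

/* Same shape as the stub above: keep the top 'bits' bits of val */
#define hash_64_model(val, bits) \
	((unsigned long)((uint64_t)(val) >> (64 - (bits))))

int main(void)
{
	uint64_t blocknr = 0x1234567890abcdefULL;

	/* An 8-bit bucket index comes from the top byte: prints 0x12 */
	printf("bucket = 0x%lx\n", hash_64_model(blocknr, 8));
	return 0;
}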
From: Simon Glass <simon.glass@canonical.com> Add transaction.c to the jbd2 Makefile and update includes to use the ext4l compatibility layer. Add stubs for various functions needed by transaction.c including atomic_add_return, prepare_to_wait_exclusive, rwsem_acquire_read, hrtimer functions, and JBD2 trace stubs. Remove JBD2 transaction function stubs that are now provided by transaction.c. Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: Simon Glass <simon.glass@canonical.com> --- fs/ext4l/ext4_uboot.h | 35 ++++++++ fs/ext4l/stub.c | 182 +++++++----------------------------------- fs/jbd2/Makefile | 2 +- fs/jbd2/transaction.c | 15 +--- 4 files changed, 64 insertions(+), 170 deletions(-) diff --git a/fs/ext4l/ext4_uboot.h b/fs/ext4l/ext4_uboot.h index f43c3a999f9..e57e479a462 100644 --- a/fs/ext4l/ext4_uboot.h +++ b/fs/ext4l/ext4_uboot.h @@ -130,7 +130,9 @@ struct kobject { struct lockdep_map { int dummy; }; struct lock_class_key { int dummy; }; #define rwsem_acquire(l, s, t, i) do { } while (0) +#define rwsem_acquire_read(l, s, t, i) do { } while (0) #define rwsem_release(l, i) do { } while (0) +#define _THIS_IP_ ((unsigned long)0) /* completion - stub */ struct completion { @@ -140,6 +142,10 @@ struct completion { /* Cache alignment - stub */ #define ____cacheline_aligned_in_smp +/* Pointer check macros */ +#define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= PAGE_SIZE) +#define data_race(expr) (expr) + /* Block I/O request flags - stubs */ #define REQ_META 0 #define REQ_PRIO 0 @@ -321,6 +327,8 @@ extern struct user_namespace init_user_ns; #define lock_buffer(bh) do { } while (0) #define unlock_buffer(bh) do { } while (0) #define sb_getblk(sb, block) ((struct buffer_head *)NULL) +#define test_clear_buffer_dirty(bh) ({ (void)(bh); 0; }) +#define wait_on_bit_io(addr, bit, mode) do { (void)(addr); (void)(bit); (void)(mode); } while (0) /* inode_needs_sync - stub */ #define inode_needs_sync(inode) (0) @@ -1068,6 +1076,7 @@ static inline unsigned long memweight(const void *ptr, size_t bytes) #define filemap_invalidate_lock_shared(m) do { } while (0) #define filemap_invalidate_unlock_shared(m) do { } while (0) #define filemap_write_and_wait_range(m, s, e) ({ (void)(m); (void)(s); (void)(e); 0; }) +#define filemap_fdatawrite_range(m, s, e) ({ (void)(m); (void)(s); (void)(e); 0; }) #define truncate_pagecache(i, s) do { } while (0) #define pagecache_isize_extended(i, f, t) do { } while (0) #define invalidate_mapping_pages(m, s, e) do { (void)(m); (void)(s); (void)(e); } while (0) @@ -1181,6 +1190,15 @@ static inline ktime_t ktime_sub(ktime_t a, ktime_t b) return a - b; } +static inline ktime_t ktime_add_ns(ktime_t kt, s64 ns) +{ + return kt + ns; +} + +/* hrtimer stubs */ +#define HRTIMER_MODE_ABS 0 +#define schedule_hrtimeout(exp, mode) ({ (void)(exp); (void)(mode); 0; }) + /* write lock variants */ #define write_trylock(lock) ({ (void)(lock); 1; }) @@ -2088,6 +2106,8 @@ struct fs_parse_result { #define time_after(a, b) time_before(b, a) #endif #define msecs_to_jiffies(m) ((m) * HZ / 1000) +#define jiffies_to_msecs(j) ((j) * 1000 / HZ) +#define round_jiffies_up(j) (j) /* Path lookup flags */ #define LOOKUP_FOLLOW 0x0001 @@ -2490,6 +2510,12 @@ static inline int atomic_inc_return(atomic_t *v) return ++(v->counter); } +/* atomic_add_return - add and return new value */ +static inline int atomic_add_return(int i, atomic_t *v) +{ + return (v->counter += i); +} + /* pde_data - proc dir entry data (not supported in U-Boot) */ #define pde_data(inode) ((void *)NULL) @@ -2728,6 
+2754,8 @@ struct wait_bit_entry { ({ (void)(word); (void)(bit); (wait_queue_head_t *)NULL; }) #define prepare_to_wait(wq, wait, state) \ do { (void)(wq); (void)(wait); (void)(state); } while (0) +#define prepare_to_wait_exclusive(wq, wait, state) \ + do { (void)(wq); (void)(wait); (void)(state); } while (0) #define finish_wait(wq, wait) \ do { (void)(wq); (void)(wait); } while (0) @@ -2823,6 +2851,13 @@ struct wait_bit_entry { do { (void)(d); (void)(tid); (void)(stats); } while (0) #define trace_jbd2_end_commit(j, t) do { (void)(j); (void)(t); } while (0) +/* JBD2 transaction.c trace stubs */ +#define trace_jbd2_handle_start(...) do { } while (0) +#define trace_jbd2_handle_extend(...) do { } while (0) +#define trace_jbd2_handle_restart(...) do { } while (0) +#define trace_jbd2_handle_stats(...) do { } while (0) +#define trace_jbd2_lock_buffer_stall(...) do { } while (0) + /* JBD2 journal.c stubs */ #define alloc_buffer_head(gfp) ((struct buffer_head *)kzalloc(sizeof(struct buffer_head), gfp)) #define __getblk(bdev, block, size) ({ (void)(bdev); (void)(block); (void)(size); (struct buffer_head *)NULL; }) diff --git a/fs/ext4l/stub.c b/fs/ext4l/stub.c index 989ae2c85c1..5448040e14c 100644 --- a/fs/ext4l/stub.c +++ b/fs/ext4l/stub.c @@ -31,75 +31,26 @@ typedef void bh_end_io_t(struct buffer_head *bh, int uptodate); /* ext4_decode_error is now in super.c */ /* - * JBD2 journal stubs + * JBD2 journal stubs - most now in transaction.c, journal.c, revoke.c */ struct jbd2_journal_handle; typedef struct jbd2_journal_handle handle_t; struct journal_s; typedef struct journal_s journal_t; -struct jbd2_buffer_trigger_type; - -handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, - int revoke_records, int gfp_mask, int type, - unsigned int line_no) -{ - return NULL; -} - -int jbd2_journal_stop(handle_t *handle) -{ - return 0; -} - -void jbd2_journal_free_reserved(handle_t *handle) -{ -} - -int jbd2_journal_start_reserved(handle_t *handle, int type, unsigned int line) -{ - return 0; -} - -int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records) -{ - return 0; -} - -int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) -{ - return 0; -} - -void jbd2_journal_set_triggers(struct buffer_head *bh, - struct jbd2_buffer_trigger_type *type) -{ -} - -int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) -{ - return 0; -} +/* jbd2__journal_start is now in transaction.c */ +/* jbd2_journal_stop is now in transaction.c */ +/* jbd2_journal_free_reserved is now in transaction.c */ +/* jbd2_journal_start_reserved is now in transaction.c */ +/* jbd2_journal_extend is now in transaction.c */ +/* jbd2_journal_get_write_access is now in transaction.c */ +/* jbd2_journal_set_triggers is now in transaction.c */ +/* jbd2_journal_forget is now in transaction.c */ /* jbd2_journal_revoke is now in revoke.c */ - -int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) -{ - return 0; -} - -int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) -{ - return 0; -} - +/* jbd2_journal_get_create_access is now in transaction.c */ +/* jbd2_journal_dirty_metadata is now in transaction.c */ /* jbd2_journal_force_commit_nested is now in journal.c */ - -int jbd2__journal_restart(void *handle, int nblocks, int revoke_records, - int gfp_mask) -{ - return 0; -} - +/* jbd2__journal_restart is now in transaction.c */ /* jbd2_trans_will_send_data_barrier is now in journal.c */ /* @@ -232,61 +183,22 @@ struct ext4_iloc; /* Xattr 
functions are now in xattr.c */ -/* More JBD2 stubs */ -int jbd2_journal_inode_ranged_write(void *handle, struct inode *inode, - loff_t start, loff_t len) -{ - return 0; -} - - +/* jbd2_journal_inode_ranged_write is now in transaction.c */ /* ext4_read_bh_lock is now in super.c */ - -/* Fast commit */ /* ext4_fc_commit is now in fast_commit.c */ - /* ext4_force_commit is now in super.c */ - /* Inline data is now in inline.c */ - /* I/O submit stubs are now in page-io.c */ - -/* JBD2 ordered truncate */ -int jbd2_journal_begin_ordered_truncate(void *ji, loff_t new_size) -{ - return 0; -} - -void jbd2_journal_invalidate_folio(void *journal, void *folio, - unsigned long off, unsigned int len) -{ -} - +/* jbd2_journal_begin_ordered_truncate is now in transaction.c */ +/* jbd2_journal_invalidate_folio is now in transaction.c */ /* jbd2_log_wait_commit is now in journal.c */ - /* ext4_fc_track_range is now in fast_commit.c */ - -/* JBD2 journal update locking */ -void jbd2_journal_lock_updates(void *journal) -{ -} - -void jbd2_journal_unlock_updates(void *journal) -{ -} - +/* jbd2_journal_lock_updates is now in transaction.c */ +/* jbd2_journal_unlock_updates is now in transaction.c */ /* jbd2_journal_flush is now in journal.c */ - - /* ext4_fc_track_inode is now in fast_commit.c */ /* ext4_fc_init_inode is now in fast_commit.c */ - -/* JBD2 */ -int jbd2_journal_inode_ranged_wait(void *handle, struct inode *inode, - loff_t start, loff_t len) -{ - return 0; -} +/* jbd2_journal_inode_ranged_wait is now in transaction.c */ /* Inline data functions are now in inline.c */ @@ -309,12 +221,7 @@ int jbd2_journal_inode_ranged_wait(void *handle, struct inode *inode, /* xattr stubs are now in xattr.c */ /* jbd2_inode_cache is now in journal.c */ - -int jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio) -{ - return 1; -} - +/* jbd2_journal_try_to_free_buffers is now in transaction.c */ /* jbd2_journal_init_jbd_inode is now in journal.c */ /* ext4_read_inline_link is now in inline.c */ @@ -759,54 +666,25 @@ void dquot_free_block(struct inode *inode, loff_t nr) /* __jbd2_update_log_tail is now in journal.c */ /* jbd2_journal_grab_journal_head is now in journal.c */ /* jbd2_journal_put_journal_head is now in journal.c */ - -void jbd2_journal_free_transaction(void *transaction) -{ -} - +/* jbd2_journal_free_transaction is now in transaction.c */ /* jbd2_log_start_commit is now in journal.c */ - /* jbd2_journal_get_descriptor_buffer is now in journal.c */ /* jbd2_journal_update_sb_log_tail is now in journal.c */ /* jbd2_free is now in journal.c */ - /* journal_tag_bytes is now in journal.c */ - -void jbd2_journal_wait_updates(void *journal) -{ -} - -void jbd2_journal_refile_buffer(void *journal, void *jh) -{ -} - +/* jbd2_journal_wait_updates is now in transaction.c */ +/* jbd2_journal_refile_buffer is now in transaction.c */ /* jbd2_clear_buffer_revoked_flags is now in revoke.c */ /* jbd2_journal_switch_revoke_table is now in revoke.c */ /* jbd2_journal_write_revoke_records is now in revoke.c */ - -void jbd2_buffer_abort_trigger(void *jh, void *triggers) -{ -} - +/* jbd2_buffer_abort_trigger is now in transaction.c */ /* jbd2_journal_next_log_block is now in journal.c */ /* jbd2_journal_write_metadata_buffer is now in journal.c */ /* jbd2_descriptor_block_csum_set is now in journal.c */ /* jbd2_update_log_tail is now in journal.c */ - -void jbd2_journal_file_buffer(void *jh, void *transaction, int type) -{ -} - -void __jbd2_journal_refile_buffer(void *jh) -{ -} - +/* 
jbd2_journal_file_buffer is now in transaction.c */ +/* __jbd2_journal_refile_buffer is now in transaction.c */ /* cond_resched_lock is now a macro in ext4_uboot.h */ - -/* - * JBD2 stubs for journal.c - functions from transaction.c - */ - /* jbd2_journal_recover is now in recovery.c */ /* jbd2_journal_skip_recovery is now in recovery.c */ /* jbd2_journal_destroy_revoke is now in revoke.c */ @@ -815,11 +693,5 @@ void __jbd2_journal_refile_buffer(void *jh) /* jbd2_journal_set_revoke is now in revoke.c */ /* jbd2_journal_clear_revoke is now in revoke.c */ /* jbd2_journal_destroy_revoke_table is now in revoke.c */ - -void jbd2_buffer_frozen_trigger(void *jh, void *mapped_data, void *triggers) -{ -} - -void __jbd2_journal_file_buffer(void *jh, void *transaction, int type) -{ -} +/* jbd2_buffer_frozen_trigger is now in transaction.c */ +/* __jbd2_journal_file_buffer is now in transaction.c */ diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile index 5501c364abc..820417d36ca 100644 --- a/fs/jbd2/Makefile +++ b/fs/jbd2/Makefile @@ -3,4 +3,4 @@ # Makefile for JBD2 journaling layer (used by ext4l) # -obj-y := checkpoint.o commit.o journal.o recovery.o revoke.o +obj-y := checkpoint.o commit.o journal.o recovery.o revoke.o transaction.o diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 3e510564de6..a524f490c79 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -14,21 +14,8 @@ * filesystem). */ -#include <linux/time.h> -#include <linux/fs.h> +#include "../ext4l/ext4_uboot.h" #include <linux/jbd2.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/timer.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/hrtimer.h> -#include <linux/backing-dev.h> -#include <linux/bug.h> -#include <linux/module.h> -#include <linux/sched/mm.h> - -#include <trace/events/jbd2.h> static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); static void __jbd2_journal_unfile_buffer(struct journal_head *jh); -- 2.43.0
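The atomic_add_return() helper added above, like the other atomic_t stubs in ext4_uboot.h, is a plain non-atomic read-modify-write; that is safe only because U-Boot runs single-threaded, so no locking or memory ordering is required. A sketch of the semantics it preserves (transaction.c applies it to the transaction's outstanding-credit counters, for instance):

#include <stdio.h>

/* Single-threaded model of the atomic_t stub: add and return the
 * new value, with no concurrency to guard against. */
typedef struct { int counter; } atomic_model_t;

static int atomic_add_return_model(int i, atomic_model_t *v)
{
	return (v->counter += i);
}

int main(void)
{
	atomic_model_t credits = { .counter = 10 };

	/* e.g. a handle extending its reserved credits by 5 */
	printf("credits = %d\n", atomic_add_return_model(5, &credits));
	return 0;
}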
From: Simon Glass <simon.glass@canonical.com> Add extent migration support for write operations. Use the ext4_uboot.h compatibility layer instead of Linux headers. Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: Simon Glass <simon.glass@canonical.com> --- fs/ext4l/Makefile | 2 +- fs/ext4l/migrate.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4l/Makefile b/fs/ext4l/Makefile index 4003a1f2317..784ba7203b1 100644 --- a/fs/ext4l/Makefile +++ b/fs/ext4l/Makefile @@ -7,7 +7,7 @@ obj-y := interface.o stub.o obj-y += balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ extents_status.o file.o fsync.o hash.o ialloc.o \ - indirect.o inline.o inode.o mballoc.o \ + indirect.o inline.o inode.o mballoc.o migrate.o \ namei.o page-io.o readpage.o \ super.o symlink.o xattr.o \ xattr_hurd.o xattr_trusted.o \ diff --git a/fs/ext4l/migrate.c b/fs/ext4l/migrate.c index 1b0dfd963d3..63d881767e5 100644 --- a/fs/ext4l/migrate.c +++ b/fs/ext4l/migrate.c @@ -5,7 +5,7 @@ * */ -#include <linux/slab.h> +#include "ext4_uboot.h" #include "ext4_jbd2.h" #include "ext4_extents.h" -- 2.43.0
From: Simon Glass <simon.glass@canonical.com> Add support for multi-mount protection. Use ext4_uboot.h compatibility layer and add stubs for: - init_utsname() - returns static node name "u-boot" - sb_start_write/sb_end_write - no-op stubs Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: Simon Glass <simon.glass@canonical.com> --- fs/ext4l/Makefile | 2 +- fs/ext4l/ext4_uboot.h | 17 ++++++++++++++++- fs/ext4l/mmp.c | 6 +----- fs/ext4l/stub.c | 11 ++--------- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/fs/ext4l/Makefile b/fs/ext4l/Makefile index 784ba7203b1..df3c016e3c1 100644 --- a/fs/ext4l/Makefile +++ b/fs/ext4l/Makefile @@ -8,7 +8,7 @@ obj-y := interface.o stub.o obj-y += balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ extents_status.o file.o fsync.o hash.o ialloc.o \ indirect.o inline.o inode.o mballoc.o migrate.o \ - namei.o page-io.o readpage.o \ + mmp.o namei.o page-io.o readpage.o \ super.o symlink.o xattr.o \ xattr_hurd.o xattr_trusted.o \ xattr_user.o fast_commit.o orphan.o diff --git a/fs/ext4l/ext4_uboot.h b/fs/ext4l/ext4_uboot.h index e57e479a462..409391dda20 100644 --- a/fs/ext4l/ext4_uboot.h +++ b/fs/ext4l/ext4_uboot.h @@ -2271,10 +2271,11 @@ int is_power_of_2(unsigned long n); /* Superblock write operations */ #define sb_start_write_trylock(sb) ({ (void)(sb); 1; }) +#define sb_start_write(sb) do { } while (0) #define sb_end_write(sb) do { } while (0) /* Scheduler stubs */ -#define schedule_timeout_interruptible(t) do { } while (0) +#define schedule_timeout_interruptible(t) ({ (void)(t); 0; }) /* Page allocation - declarations for stub.c */ unsigned long get_zeroed_page(gfp_t gfp); @@ -2928,4 +2929,18 @@ loff_t seq_lseek(struct file *f, loff_t o, int w); /* Ratelimited printk for journal.c */ #define pr_notice_ratelimited(fmt, ...) pr_notice(fmt, ##__VA_ARGS__) +/* + * Stubs for mmp.c + */ + +/* init_utsname - returns pointer to system name structure */ +struct new_utsname { + char nodename[65]; +}; +static inline struct new_utsname *init_utsname(void) +{ + static struct new_utsname uts = { .nodename = "u-boot" }; + return &uts; +} + #endif /* __EXT4_UBOOT_H__ */ diff --git a/fs/ext4l/mmp.c b/fs/ext4l/mmp.c index ab1ff51302f..2da65c022f4 100644 --- a/fs/ext4l/mmp.c +++ b/fs/ext4l/mmp.c @@ -1,10 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/fs.h> -#include <linux/random.h> -#include <linux/buffer_head.h> -#include <linux/utsname.h> -#include <linux/kthread.h> +#include "ext4_uboot.h" #include "ext4.h" /* Checksumming functions */ diff --git a/fs/ext4l/stub.c b/fs/ext4l/stub.c index 5448040e14c..1bf1e958e48 100644 --- a/fs/ext4l/stub.c +++ b/fs/ext4l/stub.c @@ -461,10 +461,7 @@ int dquot_suspend(void *sb, int flags) return 0; } -/* MMP daemon */ -void ext4_stop_mmpd(void *sbi) -{ -} +/* MMP daemon - now in mmp.c */ /* Sysfs */ void ext4_unregister_sysfs(void *sb) @@ -552,11 +549,7 @@ u64 sb_bdev_nr_blocks(struct super_block *sb) /* bdev_can_atomic_write is now a macro in ext4_uboot.h */ /* bdev_atomic_write_unit_max_bytes is now a macro in ext4_uboot.h */ -/* Multi-mount protection */ -int ext4_multi_mount_protect(void *sb, unsigned long long mmp_block) -{ - return 0; -} +/* Multi-mount protection - now in mmp.c */ /* Generic dentry ops */ void generic_set_sb_d_ops(struct super_block *sb) -- 2.43.0
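mmp.c writes the local node name into the MMP block so other hosts contending for the filesystem can be identified; the init_utsname() stub above serves that by returning a pointer to a static singleton whose nodename is "u-boot". The pattern in isolation:

#include <stdio.h>

/* Model of the init_utsname() stub: hand back a static singleton so
 * callers can read ->nodename without any allocation. */
struct new_utsname_model {
	char nodename[65];
};

static struct new_utsname_model *init_utsname_model(void)
{
	static struct new_utsname_model uts = { .nodename = "u-boot" };

	return &uts;
}

int main(void)
{
	printf("node: %s\n", init_utsname_model()->nodename);
	return 0;
}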
From: Simon Glass <simon.glass@canonical.com> Add support for extent-moving for write operations. Use ext4_uboot.h compatibility layer and add stubs for: - down_write_nested - forwards to down_write - filemap_release_folio, IS_SWAPFILE, PAGE_MASK - lock_two_nondirectories, unlock_two_nondirectories Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: Simon Glass <simon.glass@canonical.com> --- fs/ext4l/Makefile | 2 +- fs/ext4l/ext4_uboot.h | 26 ++++++++++++++++++++++++++ fs/ext4l/move_extent.c | 5 +---- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/fs/ext4l/Makefile b/fs/ext4l/Makefile index df3c016e3c1..4afcc43e34f 100644 --- a/fs/ext4l/Makefile +++ b/fs/ext4l/Makefile @@ -8,7 +8,7 @@ obj-y := interface.o stub.o obj-y += balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ extents_status.o file.o fsync.o hash.o ialloc.o \ indirect.o inline.o inode.o mballoc.o migrate.o \ - mmp.o namei.o page-io.o readpage.o \ + mmp.o move_extent.o namei.o page-io.o readpage.o \ super.o symlink.o xattr.o \ xattr_hurd.o xattr_trusted.o \ xattr_user.o fast_commit.o orphan.o diff --git a/fs/ext4l/ext4_uboot.h b/fs/ext4l/ext4_uboot.h index 409391dda20..617ec9a8afe 100644 --- a/fs/ext4l/ext4_uboot.h +++ b/fs/ext4l/ext4_uboot.h @@ -2943,4 +2943,30 @@ static inline struct new_utsname *init_utsname(void) return &uts; } +/* + * Stubs for move_extent.c + */ + +/* down_write_nested - nested write lock acquisition */ +#define down_write_nested(sem, subclass) \ + do { (void)(sem); (void)(subclass); } while (0) + +/* filemap_release_folio - try to release a folio */ +#define filemap_release_folio(folio, gfp) \ + ({ (void)(folio); (void)(gfp); 1; }) + +/* IS_SWAPFILE - check if inode is a swap file */ +#define IS_SWAPFILE(inode) ({ (void)(inode); 0; }) + +/* PAGE_MASK - mask for page alignment */ +#ifndef PAGE_MASK +#define PAGE_MASK (~(PAGE_SIZE - 1)) +#endif + +/* lock_two_nondirectories - lock two inodes in order */ +#define lock_two_nondirectories(i1, i2) \ + do { (void)(i1); (void)(i2); } while (0) +#define unlock_two_nondirectories(i1, i2) \ + do { (void)(i1); (void)(i2); } while (0) + #endif /* __EXT4_UBOOT_H__ */ diff --git a/fs/ext4l/move_extent.c b/fs/ext4l/move_extent.c index 4b091c21908..db3066b22a8 100644 --- a/fs/ext4l/move_extent.c +++ b/fs/ext4l/move_extent.c @@ -5,10 +5,7 @@ * Akira Fujita <a-fujita@rs.jp.nec.com> */ -#include <linux/fs.h> -#include <linux/quotaops.h> -#include <linux/slab.h> -#include <linux/sched/mm.h> +#include "ext4_uboot.h" #include "ext4_jbd2.h" #include "ext4.h" #include "ext4_extents.h" -- 2.43.0
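In Linux, lock_two_nondirectories() takes both inodes' locks in a fixed order so that two tasks locking the same pair can never deadlock; the no-op stub above is sound because U-Boot has no concurrent tasks. A standalone sketch of the ordering idea, with hypothetical helpers and pthreads used only for illustration:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct node {
	pthread_mutex_t lock;
};

/* Always lock the lower-addressed node first, so every caller agrees
 * on the order regardless of argument order. */
static void lock_two(struct node *a, struct node *b)
{
	if ((uintptr_t)a > (uintptr_t)b) {
		struct node *tmp = a;

		a = b;
		b = tmp;
	}
	pthread_mutex_lock(&a->lock);
	if (a != b)
		pthread_mutex_lock(&b->lock);
}

static void unlock_two(struct node *a, struct node *b)
{
	pthread_mutex_unlock(&a->lock);
	if (a != b)
		pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	struct node x = { PTHREAD_MUTEX_INITIALIZER };
	struct node y = { PTHREAD_MUTEX_INITIALIZER };

	lock_two(&y, &x);	/* acquires in the same order as (&x, &y) */
	unlock_two(&y, &x);
	puts("ok");
	return 0;
}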
From: Simon Glass <simon.glass@canonical.com> Add filesystem-resize support. Use ext4_uboot.h compatibility layer and add stubs for: - test_and_set_bit_lock - forwards to test_and_set_bit - div64_u64 - simple 64-bit division - time_is_before_jiffies - always returns false Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: Simon Glass <simon.glass@canonical.com> --- fs/ext4l/Makefile | 2 +- fs/ext4l/ext4_uboot.h | 19 +++++++++++++++++++ fs/ext4l/resize.c | 6 +----- fs/ext4l/stub.c | 7 +++++-- 4 files changed, 26 insertions(+), 8 deletions(-) diff --git a/fs/ext4l/Makefile b/fs/ext4l/Makefile index 4afcc43e34f..ee929539eb8 100644 --- a/fs/ext4l/Makefile +++ b/fs/ext4l/Makefile @@ -8,7 +8,7 @@ obj-y := interface.o stub.o obj-y += balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ extents_status.o file.o fsync.o hash.o ialloc.o \ indirect.o inline.o inode.o mballoc.o migrate.o \ - mmp.o move_extent.o namei.o page-io.o readpage.o \ + mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \ super.o symlink.o xattr.o \ xattr_hurd.o xattr_trusted.o \ xattr_user.o fast_commit.o orphan.o diff --git a/fs/ext4l/ext4_uboot.h b/fs/ext4l/ext4_uboot.h index 617ec9a8afe..8a3143a6d27 100644 --- a/fs/ext4l/ext4_uboot.h +++ b/fs/ext4l/ext4_uboot.h @@ -2969,4 +2969,23 @@ static inline struct new_utsname *init_utsname(void) #define unlock_two_nondirectories(i1, i2) \ do { (void)(i1); (void)(i2); } while (0) +/* + * Stubs for resize.c + */ + +/* test_and_set_bit_lock - test and set a bit atomically */ +#define test_and_set_bit_lock(nr, addr) test_and_set_bit(nr, addr) + +/* div64_u64 - 64-bit by 64-bit division */ +static inline u64 div64_u64(u64 dividend, u64 divisor) +{ + return dividend / divisor; +} + +/* time_is_before_jiffies - check if time is before current jiffies */ +#define time_is_before_jiffies(a) ({ (void)(a); 0; }) + +/* ext4_update_overhead - declaration for stub.c */ +int ext4_update_overhead(struct super_block *sb, bool force); + #endif /* __EXT4_UBOOT_H__ */ diff --git a/fs/ext4l/resize.c b/fs/ext4l/resize.c index 050f26168d9..a52eac5f8e1 100644 --- a/fs/ext4l/resize.c +++ b/fs/ext4l/resize.c @@ -9,11 +9,7 @@ * This could probably be made into a module, because it is not often in use. */ - -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/jiffies.h> - +#include "ext4_uboot.h" #include "ext4_jbd2.h" struct ext4_rcu_ptr { diff --git a/fs/ext4l/stub.c b/fs/ext4l/stub.c index 1bf1e958e48..93315d10fa8 100644 --- a/fs/ext4l/stub.c +++ b/fs/ext4l/stub.c @@ -379,9 +379,12 @@ void *kvzalloc(size_t size, gfp_t flags) return calloc(1, size); } -void ext4_kvfree_array_rcu(void *p) +/* ext4_kvfree_array_rcu - now in resize.c */ + +/* ext4_update_overhead - stub for resize.c */ +int ext4_update_overhead(struct super_block *sb, bool force) { - free(p); + return 0; } /* String stubs */ -- 2.43.0
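div64_u64() exists in the kernel because 32-bit builds cannot always rely on plain '/' for 64-by-64 division; the stub above assumes the U-Boot toolchain's 64-bit division support is available and simply divides. In isolation:

#include <stdio.h>
#include <stdint.h>

/* Model of the div64_u64() stub: straight C division, assuming the
 * target provides 64-bit division support. */
static uint64_t div64_u64_model(uint64_t dividend, uint64_t divisor)
{
	return dividend / divisor;
}

int main(void)
{
	/* e.g. converting a byte count to 4 KiB blocks during resize */
	printf("%llu\n",
	       (unsigned long long)div64_u64_model(1ULL << 40, 4096));
	return 0;
}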
From: Simon Glass <simon.glass@canonical.com> Update fsmap.c to use ext4_uboot.h and add the required stubs: - struct fsmap and related FMR_* macros - trace_ext4_fsmap_* stub macros - list_sort and sort stub macros Co-developed-by: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: Simon Glass <simon.glass@canonical.com> --- fs/ext4l/Makefile | 2 +- fs/ext4l/ext4_uboot.h | 32 ++++++++++++++++++++++++++++++++ fs/ext4l/fsmap.c | 5 +---- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/fs/ext4l/Makefile b/fs/ext4l/Makefile index ee929539eb8..e99b900ca6d 100644 --- a/fs/ext4l/Makefile +++ b/fs/ext4l/Makefile @@ -6,7 +6,7 @@ obj-y := interface.o stub.o obj-y += balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ - extents_status.o file.o fsync.o hash.o ialloc.o \ + extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \ indirect.o inline.o inode.o mballoc.o migrate.o \ mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \ super.o symlink.o xattr.o \ diff --git a/fs/ext4l/ext4_uboot.h b/fs/ext4l/ext4_uboot.h index 8a3143a6d27..1ad9ad7f47a 100644 --- a/fs/ext4l/ext4_uboot.h +++ b/fs/ext4l/ext4_uboot.h @@ -2988,4 +2988,36 @@ static inline u64 div64_u64(u64 dividend, u64 divisor) /* ext4_update_overhead - declaration for stub.c */ int ext4_update_overhead(struct super_block *sb, bool force); +/* + * Stubs for fsmap.c + */ + +/* fsmap.c stubs - struct fsmap from linux/fsmap.h */ +struct fsmap { + __u32 fmr_device; /* device id */ + __u32 fmr_flags; /* mapping flags */ + __u64 fmr_physical; /* device offset of segment */ + __u64 fmr_owner; /* owner id */ + __u64 fmr_offset; /* file offset of segment */ + __u64 fmr_length; /* length of segment */ + __u64 fmr_reserved[3]; /* must be zero */ +}; + +#define FMR_OWN_FREE (-1ULL) +#define FMR_OWN_UNKNOWN (-2ULL) +#define FMR_OWNER(type, code) (((__u64)(type) << 32) | (__u64)(code)) +#define FMR_OF_SPECIAL_OWNER (1 << 0) +#define FMH_IF_VALID 0 +#define FMH_OF_DEV_T (1 << 0) + +#define trace_ext4_fsmap_mapping(sb, d, a, p, l, o) do { } while (0) +#define trace_ext4_fsmap_low_key(sb, d, a, p, l, o) do { } while (0) +#define trace_ext4_fsmap_high_key(sb, d, a, p, l, o) do { } while (0) + +/* list_sort and sort stubs for fsmap.c */ +#define list_sort(priv, head, cmp) \ + do { (void)(priv); (void)(head); (void)(cmp); } while (0) +#define sort(base, num, size, cmp, swap) \ + do { (void)(base); (void)(num); (void)(size); (void)(cmp); (void)(swap); } while (0) + #endif /* __EXT4_UBOOT_H__ */ diff --git a/fs/ext4l/fsmap.c b/fs/ext4l/fsmap.c index 22fc333244e..da9ee2fac8e 100644 --- a/fs/ext4l/fsmap.c +++ b/fs/ext4l/fsmap.c @@ -4,13 +4,10 @@ * * Author: Darrick J. Wong <darrick.wong@oracle.com> */ +#include "ext4_uboot.h" #include "ext4.h" -#include <linux/fsmap.h> #include "fsmap.h" #include "mballoc.h" -#include <linux/sort.h> -#include <linux/list_sort.h> -#include <trace/events/ext4.h> /* Convert an ext4_fsmap to an fsmap. */ void ext4_fsmap_from_internal(struct super_block *sb, struct fsmap *dest, -- 2.43.0
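FMR_OWNER() above packs an owner type and code into the single 64-bit fmr_owner field of struct fsmap: the type lands in the high 32 bits, the code in the low 32. A standalone check of the packing:

#include <stdio.h>
#include <stdint.h>

/* Same packing as the FMR_OWNER() macro above */
#define FMR_OWNER_MODEL(type, code) \
	(((uint64_t)(type) << 32) | (uint64_t)(code))

int main(void)
{
	uint64_t owner = FMR_OWNER_MODEL(1, 42);

	/* Unpack: prints type=1 code=42 */
	printf("type=%u code=%u\n",
	       (unsigned)(owner >> 32), (unsigned)(owner & 0xffffffffU));
	return 0;
}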