From e09b41010ba33a20a87472ee821fa407a5b8da36 Mon Sep 17 00:00:00 2001 From: José Pekkarinen Date: Mon, 11 Apr 2016 10:41:07 +0300 Subject: These changes are the raw update to linux-4.4.6-rt14. Kernel sources are taken from kernel.org, and rt patch from the rt wiki download page. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During the rebasing, the following patch collided: Force tick interrupt and get rid of softirq magic(I70131fb85). Collisions have been removed because its logic was found on the source already. Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769 Signed-off-by: José Pekkarinen --- kernel/fs/jbd2/checkpoint.c | 8 +- kernel/fs/jbd2/commit.c | 22 +-- kernel/fs/jbd2/journal.c | 58 +++---- kernel/fs/jbd2/recovery.c | 26 ++-- kernel/fs/jbd2/revoke.c | 19 +-- kernel/fs/jbd2/transaction.c | 354 +++++++++++++++++++++++++++---------------- 6 files changed, 279 insertions(+), 208 deletions(-) (limited to 'kernel/fs/jbd2') diff --git a/kernel/fs/jbd2/checkpoint.c b/kernel/fs/jbd2/checkpoint.c index 78c1545a3..6e18a06aa 100644 --- a/kernel/fs/jbd2/checkpoint.c +++ b/kernel/fs/jbd2/checkpoint.c @@ -429,7 +429,6 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) struct journal_head *last_jh; struct journal_head *next_jh = jh; int ret; - int freed = 0; if (!jh) return 0; @@ -443,10 +442,9 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) else ret = __jbd2_journal_remove_checkpoint(jh) + 1; if (!ret) - return freed; + return 0; if (ret == 2) return 1; - freed = 1; /* * This function only frees up some memory * if possible so we dont have an obligation @@ -454,10 +452,10 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) * requested: */ if (need_resched()) - return freed; + return 0; } while (jh != last_jh); - return freed; + return 0; } /* diff --git a/kernel/fs/jbd2/commit.c b/kernel/fs/jbd2/commit.c index 362e5f614..36345fefa 100644 --- a/kernel/fs/jbd2/commit.c +++ b/kernel/fs/jbd2/commit.c @@ -142,8 +142,7 @@ static int journal_submit_commit_record(journal_t *journal, tmp->h_commit_sec = cpu_to_be64(now.tv_sec); tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); - if (JBD2_HAS_COMPAT_FEATURE(journal, - JBD2_FEATURE_COMPAT_CHECKSUM)) { + if (jbd2_has_feature_checksum(journal)) { tmp->h_chksum_type = JBD2_CRC32_CHKSUM; tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; tmp->h_chksum[0] = cpu_to_be32(crc32_sum); @@ -157,8 +156,7 @@ static int journal_submit_commit_record(journal_t *journal, bh->b_end_io = journal_end_buffer_io_sync; if (journal->j_flags & JBD2_BARRIER && - !JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) + !jbd2_has_feature_async_commit(journal)) ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh); else ret = submit_bh(WRITE_SYNC, bh); @@ -317,7 +315,7 @@ static void write_tag_block(journal_t *j, journal_block_tag_t *tag, unsigned long long block) { tag->t_blocknr = cpu_to_be32(block & (u32)~0); - if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT)) + if (jbd2_has_feature_64bit(j)) tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); } @@ -356,7 +354,7 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, bh->b_size); kunmap_atomic(addr); - if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3)) + if (jbd2_has_feature_csum3(j)) tag3->t_checksum = cpu_to_be32(csum32); else tag->t_checksum = cpu_to_be16(csum32); @@ -730,8 +728,7 @@ start_journal_io: /* * Compute checksum. 
@@ -730,8 +728,7 @@ start_journal_io:
 		/*
 		 * Compute checksum.
 		 */
-		if (JBD2_HAS_COMPAT_FEATURE(journal,
-					    JBD2_FEATURE_COMPAT_CHECKSUM)) {
+		if (jbd2_has_feature_checksum(journal)) {
 			crc32_sum = jbd2_checksum_data(crc32_sum, bh);
 		}
 
@@ -797,8 +794,7 @@ start_journal_io:
 		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
 
 	/* Done it all: now write the commit record asynchronously. */
-	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
-				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+	if (jbd2_has_feature_async_commit(journal)) {
 		err = journal_submit_commit_record(journal, commit_transaction,
 						 &cbh, crc32_sum);
 		if (err)
@@ -889,8 +885,7 @@ start_journal_io:
 	commit_transaction->t_state = T_COMMIT_JFLUSH;
 	write_unlock(&journal->j_state_lock);
 
-	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
-				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+	if (!jbd2_has_feature_async_commit(journal)) {
 		err = journal_submit_commit_record(journal, commit_transaction,
 						&cbh, crc32_sum);
 		if (err)
@@ -898,8 +893,7 @@ start_journal_io:
 	}
 	if (cbh)
 		err = journal_wait_on_commit_record(journal, cbh);
-	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
-				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
+	if (jbd2_has_feature_async_commit(journal) &&
 	    journal->j_flags & JBD2_BARRIER) {
 		blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
 	}
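journal_submit_commit_record() stores the running CRC32 (crc32_sum),
accumulated block by block during the commit, into the commit header when
the v1 checksum feature is enabled. A rough user-space sketch of that
block-by-block chaining follows; the bitwise CRC routine and the seed are
stand-ins for the kernel's crc32 library, not the exact on-disk math:

#include <stdint.h>
#include <stdio.h>

/* Minimal bitwise CRC-32 (reflected, polynomial 0xEDB88320). The kernel
 * uses its own crc32 library; this stand-in keeps the sketch
 * self-contained. */
static uint32_t crc32_update(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	crc = ~crc;
	while (len--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0xEDB88320u & -(crc & 1u));
	}
	return ~crc;
}

int main(void)
{
	/* Two fake "journal blocks": the running sum chains across them the
	 * way crc32_sum is folded block by block before the commit record
	 * is written. */
	char block_a[16] = "metadata-block-A";
	char block_b[16] = "metadata-block-B";
	uint32_t crc32_sum = 0;	/* illustrative seed */

	crc32_sum = crc32_update(crc32_sum, block_a, sizeof(block_a));
	crc32_sum = crc32_update(crc32_sum, block_b, sizeof(block_b));
	printf("commit record checksum: 0x%08x\n", crc32_sum);
	return 0;
}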
diff --git a/kernel/fs/jbd2/journal.c b/kernel/fs/jbd2/journal.c
index 7003c0925..81e622681 100644
--- a/kernel/fs/jbd2/journal.c
+++ b/kernel/fs/jbd2/journal.c
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(__jbd2_debug);
 /* Checksumming functions */
 static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
 {
-	if (!jbd2_journal_has_csum_v2or3(j))
+	if (!jbd2_journal_has_csum_v2or3_feature(j))
 		return 1;
 
 	return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
@@ -371,16 +371,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 	 */
 	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
 
-retry_alloc:
-	new_bh = alloc_buffer_head(GFP_NOFS);
-	if (!new_bh) {
-		/*
-		 * Failure is not an option, but __GFP_NOFAIL is going
-		 * away; so we retry ourselves here.
-		 */
-		congestion_wait(BLK_RW_ASYNC, HZ/50);
-		goto retry_alloc;
-	}
+	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
 
 	/* keep subsequent assertions sane */
 	atomic_set(&new_bh->b_count, 1);
@@ -1144,7 +1135,6 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
 {
 	journal_t *journal = journal_init_common();
 	struct buffer_head *bh;
-	char *p;
 	int n;
 
 	if (!journal)
@@ -1157,9 +1147,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
 	journal->j_blk_offset = start;
 	journal->j_maxlen = len;
 	bdevname(journal->j_dev, journal->j_devname);
-	p = journal->j_devname;
-	while ((p = strchr(p, '/')))
-		*p = '!';
+	strreplace(journal->j_devname, '/', '!');
 	jbd2_stats_proc_init(journal);
 	n = journal->j_blocksize / sizeof(journal_block_tag_t);
 	journal->j_wbufsize = n;
@@ -1211,10 +1199,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 	journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
 	journal->j_inode = inode;
 	bdevname(journal->j_dev, journal->j_devname);
-	p = journal->j_devname;
-	while ((p = strchr(p, '/')))
-		*p = '!';
-	p = journal->j_devname + strlen(journal->j_devname);
+	p = strreplace(journal->j_devname, '/', '!');
 	sprintf(p, "-%lu", journal->j_inode->i_ino);
 	jbd_debug(1, "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
@@ -1471,7 +1456,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
 	sb->s_errno = cpu_to_be32(journal->j_errno);
 	read_unlock(&journal->j_state_lock);
 
-	jbd2_write_superblock(journal, WRITE_SYNC);
+	jbd2_write_superblock(journal, WRITE_FUA);
 }
 EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
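Both bdevname() call sites above swap an open-coded strchr() loop for
strreplace(). The kernel helper replaces characters in place and returns a
pointer to the terminating NUL, which is why journal_init_inode() can
append its "-<ino>" suffix without a separate strlen() pass. A user-space
model of that contract (devname and ino are illustrative values):

#include <stdio.h>

/* User-space model of the kernel's strreplace(): replace every 'old'
 * with 'new' in place and return a pointer to the terminating NUL. */
static char *strreplace(char *s, char old, char new)
{
	for (; *s; s++)
		if (*s == old)
			*s = new;
	return s;
}

int main(void)
{
	char devname[64] = "loop/0";
	unsigned long ino = 12;	/* illustrative inode number */

	/* Same shape as journal_init_inode(): sanitize the name, then
	 * append "-<ino>" right at the returned NUL. */
	char *p = strreplace(devname, '/', '!');

	sprintf(p, "-%lu", ino);
	printf("%s\n", devname);	/* prints "loop!0-12" */
	return 0;
}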
*/ printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " "at the same time!\n"); @@ -1560,7 +1545,7 @@ static int journal_get_superblock(journal_t *journal) } /* Load the checksum driver */ - if (jbd2_journal_has_csum_v2or3(journal)) { + if (jbd2_journal_has_csum_v2or3_feature(journal)) { journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(journal->j_chksum_driver)) { printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); @@ -1573,6 +1558,7 @@ static int journal_get_superblock(journal_t *journal) /* Check superblock checksum */ if (!jbd2_superblock_csum_verify(journal, sb)) { printk(KERN_ERR "JBD2: journal checksum error\n"); + err = -EFSBADCRC; goto out; } @@ -1664,7 +1650,7 @@ int jbd2_journal_load(journal_t *journal) printk(KERN_ERR "JBD2: journal transaction %u on %s " "is corrupt.\n", journal->j_failed_commit, journal->j_devname); - return -EIO; + return -EFSCORRUPTED; } /* OK, we've finished with the dynamic journal bits: @@ -2086,8 +2072,12 @@ static void __journal_abort_soft (journal_t *journal, int errno) __jbd2_journal_abort_hard(journal); - if (errno) + if (errno) { jbd2_journal_update_sb_errno(journal); + write_lock(&journal->j_state_lock); + journal->j_flags |= JBD2_REC_ERR; + write_unlock(&journal->j_state_lock); + } } /** @@ -2212,15 +2202,15 @@ size_t journal_tag_bytes(journal_t *journal) { size_t sz; - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) + if (jbd2_has_feature_csum3(journal)) return sizeof(journal_block_tag3_t); sz = sizeof(journal_block_tag_t); - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (jbd2_has_feature_csum2(journal)) sz += sizeof(__u16); - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + if (jbd2_has_feature_64bit(journal)) return sz; else return sz - sizeof(__u32); @@ -2363,7 +2353,7 @@ static int jbd2_journal_init_journal_head_cache(void) jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", sizeof(struct journal_head), 0, /* offset */ - SLAB_TEMPORARY, /* flags */ + SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU, NULL); /* ctor */ retval = 0; if (!jbd2_journal_head_cache) { @@ -2395,10 +2385,8 @@ static struct journal_head *journal_alloc_journal_head(void) if (!ret) { jbd_debug(1, "out of memory for journal_head\n"); pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); - while (!ret) { - yield(); - ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); - } + ret = kmem_cache_zalloc(jbd2_journal_head_cache, + GFP_NOFS | __GFP_NOFAIL); } return ret; } diff --git a/kernel/fs/jbd2/recovery.c b/kernel/fs/jbd2/recovery.c index a9079d035..7f277e49f 100644 --- a/kernel/fs/jbd2/recovery.c +++ b/kernel/fs/jbd2/recovery.c @@ -140,7 +140,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal, if (offset >= journal->j_maxlen) { printk(KERN_ERR "JBD2: corrupted journal superblock\n"); - return -EIO; + return -EFSCORRUPTED; } err = jbd2_journal_bmap(journal, offset, &blocknr); @@ -342,7 +342,7 @@ static inline unsigned long long read_tag_block(journal_t *journal, journal_block_tag_t *tag) { unsigned long long block = be32_to_cpu(tag->t_blocknr); - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + if (jbd2_has_feature_64bit(journal)) block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32; return block; } @@ -411,7 +411,7 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); csum32 = jbd2_chksum(j, csum32, buf, 
j->j_blocksize); - if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3)) + if (jbd2_has_feature_csum3(j)) return tag3->t_checksum == cpu_to_be32(csum32); else return tag->t_checksum == cpu_to_be16(csum32); @@ -527,7 +527,7 @@ static int do_one_pass(journal_t *journal, printk(KERN_ERR "JBD2: Invalid checksum " "recovering block %lu in log\n", next_log_block); - err = -EIO; + err = -EFSBADCRC; brelse(bh); goto failed; } @@ -538,8 +538,7 @@ static int do_one_pass(journal_t *journal, * just skip over the blocks it describes. */ if (pass != PASS_REPLAY) { if (pass == PASS_SCAN && - JBD2_HAS_COMPAT_FEATURE(journal, - JBD2_FEATURE_COMPAT_CHECKSUM) && + jbd2_has_feature_checksum(journal) && !info->end_transaction) { if (calc_chksums(journal, bh, &next_log_block, @@ -602,7 +601,7 @@ static int do_one_pass(journal_t *journal, journal, tag, obh->b_data, be32_to_cpu(tmp->h_sequence))) { brelse(obh); - success = -EIO; + success = -EFSBADCRC; printk(KERN_ERR "JBD2: Invalid " "checksum recovering " "block %llu in log\n", @@ -694,8 +693,7 @@ static int do_one_pass(journal_t *journal, * much to do other than move on to the next sequence * number. */ if (pass == PASS_SCAN && - JBD2_HAS_COMPAT_FEATURE(journal, - JBD2_FEATURE_COMPAT_CHECKSUM)) { + jbd2_has_feature_checksum(journal)) { int chksum_err, chksum_seen; struct commit_header *cbh = (struct commit_header *)bh->b_data; @@ -735,8 +733,7 @@ static int do_one_pass(journal_t *journal, if (chksum_err) { info->end_transaction = next_commit_ID; - if (!JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){ + if (!jbd2_has_feature_async_commit(journal)) { journal->j_failed_commit = next_commit_ID; brelse(bh); @@ -750,8 +747,7 @@ static int do_one_pass(journal_t *journal, bh->b_data)) { info->end_transaction = next_commit_ID; - if (!JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + if (!jbd2_has_feature_async_commit(journal)) { journal->j_failed_commit = next_commit_ID; brelse(bh); @@ -851,7 +847,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, rcount = be32_to_cpu(header->r_count); if (!jbd2_revoke_block_csum_verify(journal, header)) - return -EINVAL; + return -EFSBADCRC; if (jbd2_journal_has_csum_v2or3(journal)) csum_size = sizeof(struct jbd2_journal_revoke_tail); @@ -859,7 +855,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, return -EINVAL; max = rcount; - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + if (jbd2_has_feature_64bit(journal)) record_len = 8; while (offset + record_len <= max) { diff --git a/kernel/fs/jbd2/revoke.c b/kernel/fs/jbd2/revoke.c index 14214da80..705ae5778 100644 --- a/kernel/fs/jbd2/revoke.c +++ b/kernel/fs/jbd2/revoke.c @@ -141,11 +141,13 @@ static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, { struct list_head *hash_list; struct jbd2_revoke_record_s *record; + gfp_t gfp_mask = GFP_NOFS; -repeat: - record = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS); + if (journal_oom_retry) + gfp_mask |= __GFP_NOFAIL; + record = kmem_cache_alloc(jbd2_revoke_record_cache, gfp_mask); if (!record) - goto oom; + return -ENOMEM; record->sequence = seq; record->blocknr = blocknr; @@ -154,13 +156,6 @@ repeat: list_add(&record->hash, hash_list); spin_unlock(&journal->j_revoke_lock); return 0; - -oom: - if (!journal_oom_retry) - return -ENOMEM; - jbd_debug(1, "ENOMEM in %s, retrying\n", __func__); - yield(); - goto repeat; } /* Find a revoke record in the journal's hash table. 
diff --git a/kernel/fs/jbd2/revoke.c b/kernel/fs/jbd2/revoke.c
index 14214da80..705ae5778 100644
--- a/kernel/fs/jbd2/revoke.c
+++ b/kernel/fs/jbd2/revoke.c
@@ -141,11 +141,13 @@ static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
 {
 	struct list_head *hash_list;
 	struct jbd2_revoke_record_s *record;
+	gfp_t gfp_mask = GFP_NOFS;
 
-repeat:
-	record = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS);
+	if (journal_oom_retry)
+		gfp_mask |= __GFP_NOFAIL;
+	record = kmem_cache_alloc(jbd2_revoke_record_cache, gfp_mask);
 	if (!record)
-		goto oom;
+		return -ENOMEM;
 
 	record->sequence = seq;
 	record->blocknr = blocknr;
@@ -154,13 +156,6 @@ repeat:
 	list_add(&record->hash, hash_list);
 	spin_unlock(&journal->j_revoke_lock);
 	return 0;
-
-oom:
-	if (!journal_oom_retry)
-		return -ENOMEM;
-	jbd_debug(1, "ENOMEM in %s, retrying\n", __func__);
-	yield();
-	goto repeat;
 }
 
 /* Find a revoke record in the journal's hash table. */
@@ -594,7 +589,7 @@ static void write_one_revoke_record(journal_t *journal,
 	if (jbd2_journal_has_csum_v2or3(journal))
 		csum_size = sizeof(struct jbd2_journal_revoke_tail);
 
-	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+	if (jbd2_has_feature_64bit(journal))
 		sz = 8;
 	else
 		sz = 4;
@@ -624,7 +619,7 @@ static void write_one_revoke_record(journal_t *journal,
 		*descriptorp = descriptor;
 	}
 
-	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+	if (jbd2_has_feature_64bit(journal))
 		* ((__be64 *)(&descriptor->b_data[offset])) =
 			cpu_to_be64(record->blocknr);
 	else
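insert_revoke_hash() above drops its yield()-and-retry loop and instead
passes __GFP_NOFAIL when journal_oom_retry is set, moving the retry policy
into the allocator itself. A user-space analogue of that contract; the
back-off only loosely mirrors congestion_wait(BLK_RW_ASYNC, HZ/50) and the
record type is invented for the example:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* User-space analogue of "GFP_NOFS | __GFP_NOFAIL": keep retrying inside
 * the allocator instead of open-coding a yield()/retry loop at every
 * call site, which is what this patch removes. */
static void *alloc_nofail(size_t size)
{
	void *p;

	while (!(p = malloc(size)))
		usleep(20000);	/* back off, roughly HZ/50 */
	return p;
}

struct revoke_record {
	unsigned long long blocknr;
	unsigned int sequence;
};

int main(void)
{
	struct revoke_record *rec = alloc_nofail(sizeof(*rec));

	rec->blocknr = 42;	/* illustrative values */
	rec->sequence = 7;
	printf("revoked block %llu in seq %u\n", rec->blocknr, rec->sequence);
	free(rec);
	return 0;
}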
- */ - if ((gfp_mask & __GFP_FS) == 0) { - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto alloc_transaction; - } + if (!new_transaction) return -ENOMEM; - } } jbd_debug(3, "New handle %p going live.\n", handle); @@ -761,6 +773,30 @@ static void warn_dirty_buffer(struct buffer_head *bh) bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } +/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */ +static void jbd2_freeze_jh_data(struct journal_head *jh) +{ + struct page *page; + int offset; + char *source; + struct buffer_head *bh = jh2bh(jh); + + J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n"); + page = bh->b_page; + offset = offset_in_page(bh->b_data); + source = kmap_atomic(page); + /* Fire data frozen trigger just before we copy the data */ + jbd2_buffer_frozen_trigger(jh, source + offset, jh->b_triggers); + memcpy(jh->b_frozen_data, source + offset, bh->b_size); + kunmap_atomic(source); + + /* + * Now that the frozen data is saved off, we need to store any matching + * triggers. + */ + jh->b_frozen_triggers = jh->b_triggers; +} + /* * If the buffer is already part of the current transaction, then there * is nothing we need to do. If it is already part of a prior @@ -780,7 +816,6 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, journal_t *journal; int error; char *frozen_buffer = NULL; - int need_copy = 0; unsigned long start_lock, time_lock; if (is_handle_aborted(handle)) @@ -866,6 +901,26 @@ repeat: */ jh->b_modified = 0; + /* + * If the buffer is not journaled right now, we need to make sure it + * doesn't get written to disk before the caller actually commits the + * new data + */ + if (!jh->b_transaction) { + JBUFFER_TRACE(jh, "no transaction"); + J_ASSERT_JH(jh, !jh->b_next_transaction); + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + /* + * Make sure all stores to jh (b_modified, b_frozen_data) are + * visible before attaching it to the running transaction. + * Paired with barrier in jbd2_write_access_granted() + */ + smp_wmb(); + spin_lock(&journal->j_list_lock); + __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); + spin_unlock(&journal->j_list_lock); + goto done; + } /* * If there is already a copy-out version of this buffer, then we don't * need to make another one @@ -873,113 +928,70 @@ repeat: if (jh->b_frozen_data) { JBUFFER_TRACE(jh, "has frozen data"); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - jh->b_next_transaction = transaction; - goto done; + goto attach_next; } - /* Is there data here we need to preserve? */ + JBUFFER_TRACE(jh, "owned by older transaction"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction); - if (jh->b_transaction && jh->b_transaction != transaction) { - JBUFFER_TRACE(jh, "owned by older transaction"); - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); + /* + * There is one case we have to be very careful about. If the + * committing transaction is currently writing this buffer out to disk + * and has NOT made a copy-out, then we cannot modify the buffer + * contents at all right now. The essence of copy-out is that it is + * the extra copy, not the primary copy, which gets journaled. If the + * primary copy is already going to disk then we cannot do copy-out + * here. 
@@ -761,6 +773,30 @@ static void warn_dirty_buffer(struct buffer_head *bh)
 	       bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
 }
 
+/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */
+static void jbd2_freeze_jh_data(struct journal_head *jh)
+{
+	struct page *page;
+	int offset;
+	char *source;
+	struct buffer_head *bh = jh2bh(jh);
+
+	J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n");
+	page = bh->b_page;
+	offset = offset_in_page(bh->b_data);
+	source = kmap_atomic(page);
+	/* Fire data frozen trigger just before we copy the data */
+	jbd2_buffer_frozen_trigger(jh, source + offset, jh->b_triggers);
+	memcpy(jh->b_frozen_data, source + offset, bh->b_size);
+	kunmap_atomic(source);
+
+	/*
+	 * Now that the frozen data is saved off, we need to store any matching
+	 * triggers.
+	 */
+	jh->b_frozen_triggers = jh->b_triggers;
+}
+
 /*
  * If the buffer is already part of the current transaction, then there
  * is nothing we need to do. If it is already part of a prior
@@ -780,7 +816,6 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
 	journal_t *journal;
 	int error;
 	char *frozen_buffer = NULL;
-	int need_copy = 0;
 	unsigned long start_lock, time_lock;
 
 	if (is_handle_aborted(handle))
 		return -EROFS;
@@ -866,6 +901,26 @@ repeat:
 	 */
 	jh->b_modified = 0;
 
+	/*
+	 * If the buffer is not journaled right now, we need to make sure it
+	 * doesn't get written to disk before the caller actually commits the
+	 * new data
+	 */
+	if (!jh->b_transaction) {
+		JBUFFER_TRACE(jh, "no transaction");
+		J_ASSERT_JH(jh, !jh->b_next_transaction);
+		JBUFFER_TRACE(jh, "file as BJ_Reserved");
+		/*
+		 * Make sure all stores to jh (b_modified, b_frozen_data) are
+		 * visible before attaching it to the running transaction.
+		 * Paired with barrier in jbd2_write_access_granted()
+		 */
+		smp_wmb();
+		spin_lock(&journal->j_list_lock);
+		__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
+		spin_unlock(&journal->j_list_lock);
+		goto done;
+	}
 	/*
 	 * If there is already a copy-out version of this buffer, then we don't
 	 * need to make another one
@@ -873,113 +928,70 @@ repeat:
 	if (jh->b_frozen_data) {
 		JBUFFER_TRACE(jh, "has frozen data");
 		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-		jh->b_next_transaction = transaction;
-		goto done;
+		goto attach_next;
 	}
 
-	/* Is there data here we need to preserve? */
+	JBUFFER_TRACE(jh, "owned by older transaction");
+	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+	J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction);
 
-	if (jh->b_transaction && jh->b_transaction != transaction) {
-		JBUFFER_TRACE(jh, "owned by older transaction");
-		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-		J_ASSERT_JH(jh, jh->b_transaction ==
-					journal->j_committing_transaction);
+	/*
+	 * There is one case we have to be very careful about.  If the
+	 * committing transaction is currently writing this buffer out to disk
+	 * and has NOT made a copy-out, then we cannot modify the buffer
+	 * contents at all right now.  The essence of copy-out is that it is
+	 * the extra copy, not the primary copy, which gets journaled.  If the
+	 * primary copy is already going to disk then we cannot do copy-out
+	 * here.
+	 */
+	if (buffer_shadow(bh)) {
+		JBUFFER_TRACE(jh, "on shadow: sleep");
+		jbd_unlock_bh_state(bh);
+		wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
+		goto repeat;
+	}
 
-		/* There is one case we have to be very careful about.
-		 * If the committing transaction is currently writing
-		 * this buffer out to disk and has NOT made a copy-out,
-		 * then we cannot modify the buffer contents at all
-		 * right now.  The essence of copy-out is that it is the
-		 * extra copy, not the primary copy, which gets
-		 * journaled.  If the primary copy is already going to
-		 * disk then we cannot do copy-out here. */
-
-		if (buffer_shadow(bh)) {
-			JBUFFER_TRACE(jh, "on shadow: sleep");
+	/*
+	 * Only do the copy if the currently-owning transaction still needs it.
+	 * If buffer isn't on BJ_Metadata list, the committing transaction is
+	 * past that stage (here we use the fact that BH_Shadow is set under
+	 * bh_state lock together with refiling to BJ_Shadow list and at this
+	 * point we know the buffer doesn't have BH_Shadow set).
+	 *
+	 * Subtle point, though: if this is a get_undo_access, then we will be
+	 * relying on the frozen_data to contain the new value of the
+	 * committed_data record after the transaction, so we HAVE to force the
+	 * frozen_data copy in that case.
+	 */
+	if (jh->b_jlist == BJ_Metadata || force_copy) {
+		JBUFFER_TRACE(jh, "generate frozen data");
+		if (!frozen_buffer) {
+			JBUFFER_TRACE(jh, "allocate memory for buffer");
 			jbd_unlock_bh_state(bh);
-			wait_on_bit_io(&bh->b_state, BH_Shadow,
-				       TASK_UNINTERRUPTIBLE);
-			goto repeat;
-		}
-
-		/*
-		 * Only do the copy if the currently-owning transaction still
-		 * needs it. If buffer isn't on BJ_Metadata list, the
-		 * committing transaction is past that stage (here we use the
-		 * fact that BH_Shadow is set under bh_state lock together with
-		 * refiling to BJ_Shadow list and at this point we know the
-		 * buffer doesn't have BH_Shadow set).
-		 *
-		 * Subtle point, though: if this is a get_undo_access,
-		 * then we will be relying on the frozen_data to contain
-		 * the new value of the committed_data record after the
-		 * transaction, so we HAVE to force the frozen_data copy
-		 * in that case.
-		 */
-		if (jh->b_jlist == BJ_Metadata || force_copy) {
-			JBUFFER_TRACE(jh, "generate frozen data");
+			frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS);
 			if (!frozen_buffer) {
-				JBUFFER_TRACE(jh, "allocate memory for buffer");
-				jbd_unlock_bh_state(bh);
-				frozen_buffer =
-					jbd2_alloc(jh2bh(jh)->b_size,
-						   GFP_NOFS);
-				if (!frozen_buffer) {
-					printk(KERN_ERR
-					       "%s: OOM for frozen_buffer\n",
-					       __func__);
-					JBUFFER_TRACE(jh, "oom!");
-					error = -ENOMEM;
-					jbd_lock_bh_state(bh);
-					goto done;
-				}
-				goto repeat;
+				printk(KERN_ERR "%s: OOM for frozen_buffer\n",
+				       __func__);
+				JBUFFER_TRACE(jh, "oom!");
+				error = -ENOMEM;
+				goto out;
 			}
-			jh->b_frozen_data = frozen_buffer;
-			frozen_buffer = NULL;
-			need_copy = 1;
+			goto repeat;
 		}
-		jh->b_next_transaction = transaction;
+		jh->b_frozen_data = frozen_buffer;
+		frozen_buffer = NULL;
+		jbd2_freeze_jh_data(jh);
 	}
-
-
+attach_next:
 	/*
-	 * Finally, if the buffer is not journaled right now, we need to make
-	 * sure it doesn't get written to disk before the caller actually
-	 * commits the new data
+	 * Make sure all stores to jh (b_modified, b_frozen_data) are visible
+	 * before attaching it to the running transaction. Paired with barrier
+	 * in jbd2_write_access_granted()
 	 */
-	if (!jh->b_transaction) {
-		JBUFFER_TRACE(jh, "no transaction");
-		J_ASSERT_JH(jh, !jh->b_next_transaction);
-		JBUFFER_TRACE(jh, "file as BJ_Reserved");
-		spin_lock(&journal->j_list_lock);
-		__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
-		spin_unlock(&journal->j_list_lock);
-	}
+	smp_wmb();
+	jh->b_next_transaction = transaction;
 
 done:
-	if (need_copy) {
-		struct page *page;
-		int offset;
-		char *source;
-
-		J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
-			    "Possible IO failure.\n");
-		page = jh2bh(jh)->b_page;
-		offset = offset_in_page(jh2bh(jh)->b_data);
-		source = kmap_atomic(page);
-		/* Fire data frozen trigger just before we copy the data */
-		jbd2_buffer_frozen_trigger(jh, source + offset,
-					   jh->b_triggers);
-		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
-		kunmap_atomic(source);
-
-		/*
-		 * Now that the frozen data is saved off, we need to store
-		 * any matching triggers.
-		 */
-		jh->b_frozen_triggers = jh->b_triggers;
-	}
 	jbd_unlock_bh_state(bh);
 
 	/*
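The new jbd2_freeze_jh_data() helper above runs the frozen-data trigger on
the live buffer contents and then snapshots them, so the committing
transaction journals the snapshot while the live buffer can keep changing.
A user-space sketch of that ordering; malloc() stands in for the
preallocated b_frozen_data and the trigger type is invented for the
example:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative trigger hook, standing in for the frozen trigger. */
typedef void (*frozen_trigger_t)(char *data, size_t size);

static void csum_trigger(char *data, size_t size)
{
	printf("trigger fired on %zu bytes before freeze\n", size);
}

/* Same order of operations as jbd2_freeze_jh_data(): fire the trigger on
 * the live data, then snapshot it into the frozen copy that is what
 * actually gets journaled. */
static char *freeze_data(char *live, size_t size, frozen_trigger_t trigger)
{
	char *frozen = malloc(size);

	if (!frozen)
		return NULL;
	if (trigger)
		trigger(live, size);
	memcpy(frozen, live, size);
	return frozen;
}

int main(void)
{
	char live[16] = "committed state";
	char *frozen = freeze_data(live, sizeof(live), csum_trigger);

	strcpy(live, "new state");	/* later modification, after copy-out */
	printf("live: %s, frozen: %s\n", live, frozen);
	free(frozen);
	return 0;
}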
@@ -996,6 +1008,59 @@ out:
 	return error;
 }
 
+/* Fast check whether buffer is already attached to the required transaction */
+static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh,
+				      bool undo)
+{
+	struct journal_head *jh;
+	bool ret = false;
+
+	/* Dirty buffers require special handling... */
+	if (buffer_dirty(bh))
+		return false;
+
+	/*
+	 * RCU protects us from dereferencing freed pages. So the checks we do
+	 * are guaranteed not to oops. However the jh slab object can get freed
+	 * & reallocated while we work with it. So we have to be careful. When
+	 * we see jh attached to the running transaction, we know it must stay
+	 * so until the transaction is committed. Thus jh won't be freed and
+	 * will be attached to the same bh while we run. However it can
+	 * happen jh gets freed, reallocated, and attached to the transaction
+	 * just after we get pointer to it from bh. So we have to be careful
+	 * and recheck jh still belongs to our bh before we return success.
+	 */
+	rcu_read_lock();
+	if (!buffer_jbd(bh))
+		goto out;
+	/* This should be bh2jh() but that doesn't work with inline functions */
+	jh = READ_ONCE(bh->b_private);
+	if (!jh)
+		goto out;
+	/* For undo access buffer must have data copied */
+	if (undo && !jh->b_committed_data)
+		goto out;
+	if (jh->b_transaction != handle->h_transaction &&
+	    jh->b_next_transaction != handle->h_transaction)
+		goto out;
+	/*
+	 * There are two reasons for the barrier here:
+	 * 1) Make sure to fetch b_bh after we did previous checks so that we
+	 * detect when jh went through free, realloc, attach to transaction
+	 * while we were checking. Paired with implicit barrier in that path.
+	 * 2) So that access to bh done after jbd2_write_access_granted()
+	 * doesn't get reordered and see inconsistent state of concurrent
+	 * do_get_write_access().
+	 */
+	smp_mb();
+	if (unlikely(jh->b_bh != bh))
+		goto out;
+	ret = true;
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
 /**
  * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
  * @handle: transaction to add buffer modifications to
@@ -1009,9 +1074,13 @@ out:
 
 int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
 {
-	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+	struct journal_head *jh;
 	int rc;
 
+	if (jbd2_write_access_granted(handle, bh, false))
+		return 0;
+
+	jh = jbd2_journal_add_journal_head(bh);
 	/* We do not want to get caught playing with fields which the
 	 * log thread also manipulates.  Make sure that the buffer
 	 * completes any outstanding IO before proceeding. */
@@ -1141,11 +1210,14 @@ out:
 int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
 {
 	int err;
-	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+	struct journal_head *jh;
 	char *committed_data = NULL;
 
 	JBUFFER_TRACE(jh, "entry");
 
+	if (jbd2_write_access_granted(handle, bh, true))
+		return 0;
+
+	jh = jbd2_journal_add_journal_head(bh);
 	/*
 	 * Do this first --- it can drop the journal lock, so we want to
 	 * make sure that obtaining the committed_data is done
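The smp_wmb() added in do_get_write_access() pairs with the smp_mb() in
the new lockless jbd2_write_access_granted() fast path: initialize the
journal-head fields first, then publish the attachment, and only trust the
fields after the barrier on the reader side. A toy C11 model of that
publish/check pairing; release/acquire ordering is used here in place of
the kernel's explicit barriers, and the structure is invented for the
example:

#include <stdatomic.h>
#include <stdio.h>

/* Toy model: writer initializes the fields, then publishes the pointer;
 * the lockless reader checks the published pointer before trusting the
 * fields, mirroring the smp_wmb()/smp_mb() pairing in the patch. */
struct jh {
	int b_modified;
	void *b_frozen_data;
};

static struct jh jh_storage;
static _Atomic(struct jh *) published;

static void writer(void)
{
	jh_storage.b_modified = 0;
	jh_storage.b_frozen_data = NULL;
	/* release ordering plays the role of smp_wmb() before the attach */
	atomic_store_explicit(&published, &jh_storage, memory_order_release);
}

static int reader(void)
{
	/* acquire ordering plays the role of smp_mb() in the fast path */
	struct jh *jh = atomic_load_explicit(&published, memory_order_acquire);

	if (!jh)
		return -1;	/* not attached yet: take the slow path */
	return jh->b_modified;	/* fields are now safe to inspect */
}

int main(void)
{
	writer();
	printf("b_modified = %d\n", reader());
	return 0;
}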
@@ -1230,8 +1302,6 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
 	triggers->t_abort(triggers, jh2bh(jh));
 }
 
-
-
 /**
  * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
  * @handle: transaction to add buffer to.
@@ -1264,12 +1334,41 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 
 	if (is_handle_aborted(handle))
 		return -EROFS;
-	journal = transaction->t_journal;
-	jh = jbd2_journal_grab_journal_head(bh);
-	if (!jh) {
+	if (!buffer_jbd(bh)) {
 		ret = -EUCLEAN;
 		goto out;
 	}
+	/*
+	 * We don't grab jh reference here since the buffer must be part
+	 * of the running transaction.
+	 */
+	jh = bh2jh(bh);
+	/*
+	 * This and the following assertions are unreliable since we may see jh
+	 * in inconsistent state unless we grab bh_state lock. But this is
+	 * crucial to catch bugs so let's do a reliable check until the
+	 * lockless handling is fully proven.
+	 */
+	if (jh->b_transaction != transaction &&
+	    jh->b_next_transaction != transaction) {
+		jbd_lock_bh_state(bh);
+		J_ASSERT_JH(jh, jh->b_transaction == transaction ||
+				jh->b_next_transaction == transaction);
+		jbd_unlock_bh_state(bh);
+	}
+	if (jh->b_modified == 1) {
+		/* If it's in our transaction it must be in BJ_Metadata list. */
+		if (jh->b_transaction == transaction &&
+		    jh->b_jlist != BJ_Metadata) {
+			jbd_lock_bh_state(bh);
+			J_ASSERT_JH(jh, jh->b_transaction != transaction ||
+					jh->b_jlist == BJ_Metadata);
+			jbd_unlock_bh_state(bh);
+		}
+		goto out;
+	}
+
+	journal = transaction->t_journal;
 	jbd_debug(5, "journal_head %p\n", jh);
 	JBUFFER_TRACE(jh, "entry");
@@ -1360,7 +1459,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 	spin_unlock(&journal->j_list_lock);
 out_unlock_bh:
 	jbd_unlock_bh_state(bh);
-	jbd2_journal_put_journal_head(jh);
 out:
 	JBUFFER_TRACE(jh, "exit");
 	return ret;
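The b_modified == 1 early return added to jbd2_journal_dirty_metadata()
above is what guarantees a buffer is charged at most one credit per
transaction. A toy model of that accounting; the types and error value are
illustrative, not the kernel's:

#include <stdio.h>

/* Toy model of the b_modified fast path: a buffer only consumes a credit
 * the first time it is dirtied within a transaction; later calls see
 * b_modified == 1 and return early. */
struct handle { int credits; };
struct buf    { int b_modified; };

static int dirty_metadata(struct handle *h, struct buf *b)
{
	if (b->b_modified)
		return 0;	/* already accounted in this transaction */
	if (h->credits <= 0)
		return -1;	/* out of credits (ENOSPC in the real code) */
	b->b_modified = 1;
	h->credits--;
	return 0;
}

int main(void)
{
	struct handle h = { .credits = 1 };
	struct buf b = { 0 };

	/* Second call is free: the buffer was already charged once. */
	printf("%d %d credits=%d\n", dirty_metadata(&h, &b),
	       dirty_metadata(&h, &b), h.credits);
	return 0;
}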
@@ -1843,8 +1941,8 @@ out:
  * @journal: journal for operation
  * @page: to try and free
  * @gfp_mask: we use the mask to detect how hard should we try to release
- * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
- * release the buffers.
+ * buffers. If __GFP_DIRECT_RECLAIM and __GFP_FS is set, we wait for commit
+ * code to release the buffers.
  *
  *
  * For all the buffers on this page,
@@ -2058,6 +2156,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
 
 		if (!buffer_dirty(bh)) {
 			/* bdflush has written it.  We can drop it now */
+			__jbd2_journal_remove_checkpoint(jh);
 			goto zap_buffer;
 		}
 
@@ -2087,6 +2186,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
 			/* The orphan record's transaction has
 			 * committed.  We can cleanse this buffer */
 			clear_buffer_jbddirty(bh);
+			__jbd2_journal_remove_checkpoint(jh);
 			goto zap_buffer;
 		}
 	}
--
cgit 1.2.3-korg