summary | refs | log | tree | commit | diff | stats
path: root/kernel/fs/jbd2
diff options
context:
space:
mode:
author José Pekkarinen <jose.pekkarinen@nokia.com> 2016-04-11 10:41:07 +0300
committer José Pekkarinen <jose.pekkarinen@nokia.com> 2016-04-13 08:17:18 +0300
commit e09b41010ba33a20a87472ee821fa407a5b8da36 (patch)
tree d10dc367189862e7ca5c592f033dc3726e1df4e3 /kernel/fs/jbd2
parent f93b97fd65072de626c074dbe099a1fff05ce060 (diff)
These changes are the raw update to linux-4.4.6-rt14. Kernel sources
are taken from kernel.org, and the rt patch from the rt wiki download page. During the rebase, the following patch collided: Force tick interrupt and get rid of softirq magic (I70131fb85). The colliding changes were dropped because their logic was already present in the source. Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769 Signed-off-by: José Pekkarinen <jose.pekkarinen@nokia.com>
Diffstat (limited to 'kernel/fs/jbd2')
-rw-r--r-- kernel/fs/jbd2/checkpoint.c 8
-rw-r--r-- kernel/fs/jbd2/commit.c 22
-rw-r--r-- kernel/fs/jbd2/journal.c 58
-rw-r--r-- kernel/fs/jbd2/recovery.c 26
-rw-r--r-- kernel/fs/jbd2/revoke.c 19
-rw-r--r-- kernel/fs/jbd2/transaction.c 354
6 files changed, 279 insertions, 208 deletions
diff --git a/kernel/fs/jbd2/checkpoint.c b/kernel/fs/jbd2/checkpoint.c
index 78c1545a3..6e18a06aa 100644
--- a/kernel/fs/jbd2/checkpoint.c
+++ b/kernel/fs/jbd2/checkpoint.c
@@ -429,7 +429,6 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
struct journal_head *last_jh;
struct journal_head *next_jh = jh;
int ret;
- int freed = 0;
if (!jh)
return 0;
@@ -443,10 +442,9 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
else
ret = __jbd2_journal_remove_checkpoint(jh) + 1;
if (!ret)
- return freed;
+ return 0;
if (ret == 2)
return 1;
- freed = 1;
/*
* This function only frees up some memory
* if possible so we dont have an obligation
@@ -454,10 +452,10 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
* requested:
*/
if (need_resched())
- return freed;
+ return 0;
} while (jh != last_jh);
- return freed;
+ return 0;
}
/*
diff --git a/kernel/fs/jbd2/commit.c b/kernel/fs/jbd2/commit.c
index 362e5f614..36345fefa 100644
--- a/kernel/fs/jbd2/commit.c
+++ b/kernel/fs/jbd2/commit.c
@@ -142,8 +142,7 @@ static int journal_submit_commit_record(journal_t *journal,
tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
- if (JBD2_HAS_COMPAT_FEATURE(journal,
- JBD2_FEATURE_COMPAT_CHECKSUM)) {
+ if (jbd2_has_feature_checksum(journal)) {
tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
@@ -157,8 +156,7 @@ static int journal_submit_commit_record(journal_t *journal,
bh->b_end_io = journal_end_buffer_io_sync;
if (journal->j_flags & JBD2_BARRIER &&
- !JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
+ !jbd2_has_feature_async_commit(journal))
ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
else
ret = submit_bh(WRITE_SYNC, bh);
@@ -317,7 +315,7 @@ static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
unsigned long long block)
{
tag->t_blocknr = cpu_to_be32(block & (u32)~0);
- if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(j))
tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
@@ -356,7 +354,7 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
bh->b_size);
kunmap_atomic(addr);
- if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+ if (jbd2_has_feature_csum3(j))
tag3->t_checksum = cpu_to_be32(csum32);
else
tag->t_checksum = cpu_to_be16(csum32);
@@ -730,8 +728,7 @@ start_journal_io:
/*
* Compute checksum.
*/
- if (JBD2_HAS_COMPAT_FEATURE(journal,
- JBD2_FEATURE_COMPAT_CHECKSUM)) {
+ if (jbd2_has_feature_checksum(journal)) {
crc32_sum =
jbd2_checksum_data(crc32_sum, bh);
}
@@ -797,8 +794,7 @@ start_journal_io:
blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
/* Done it all: now write the commit record asynchronously. */
- if (JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ if (jbd2_has_feature_async_commit(journal)) {
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
@@ -889,8 +885,7 @@ start_journal_io:
commit_transaction->t_state = T_COMMIT_JFLUSH;
write_unlock(&journal->j_state_lock);
- if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ if (!jbd2_has_feature_async_commit(journal)) {
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
@@ -898,8 +893,7 @@ start_journal_io:
}
if (cbh)
err = journal_wait_on_commit_record(journal, cbh);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
+ if (jbd2_has_feature_async_commit(journal) &&
journal->j_flags & JBD2_BARRIER) {
blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
}
diff --git a/kernel/fs/jbd2/journal.c b/kernel/fs/jbd2/journal.c
index 7003c0925..81e622681 100644
--- a/kernel/fs/jbd2/journal.c
+++ b/kernel/fs/jbd2/journal.c
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(__jbd2_debug);
/* Checksumming functions */
static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
{
- if (!jbd2_journal_has_csum_v2or3(j))
+ if (!jbd2_journal_has_csum_v2or3_feature(j))
return 1;
return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
@@ -371,16 +371,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
*/
J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
-retry_alloc:
- new_bh = alloc_buffer_head(GFP_NOFS);
- if (!new_bh) {
- /*
- * Failure is not an option, but __GFP_NOFAIL is going
- * away; so we retry ourselves here.
- */
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- goto retry_alloc;
- }
+ new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
/* keep subsequent assertions sane */
atomic_set(&new_bh->b_count, 1);
@@ -1144,7 +1135,6 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
{
journal_t *journal = journal_init_common();
struct buffer_head *bh;
- char *p;
int n;
if (!journal)
@@ -1157,9 +1147,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
journal->j_blk_offset = start;
journal->j_maxlen = len;
bdevname(journal->j_dev, journal->j_devname);
- p = journal->j_devname;
- while ((p = strchr(p, '/')))
- *p = '!';
+ strreplace(journal->j_devname, '/', '!');
jbd2_stats_proc_init(journal);
n = journal->j_blocksize / sizeof(journal_block_tag_t);
journal->j_wbufsize = n;
@@ -1211,10 +1199,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
journal->j_inode = inode;
bdevname(journal->j_dev, journal->j_devname);
- p = journal->j_devname;
- while ((p = strchr(p, '/')))
- *p = '!';
- p = journal->j_devname + strlen(journal->j_devname);
+ p = strreplace(journal->j_devname, '/', '!');
sprintf(p, "-%lu", journal->j_inode->i_ino);
jbd_debug(1,
"journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
@@ -1471,7 +1456,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
sb->s_errno = cpu_to_be32(journal->j_errno);
read_unlock(&journal->j_state_lock);
- jbd2_write_superblock(journal, WRITE_SYNC);
+ jbd2_write_superblock(journal, WRITE_FUA);
}
EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
@@ -1538,16 +1523,16 @@ static int journal_get_superblock(journal_t *journal)
goto out;
}
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) &&
- JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
+ if (jbd2_has_feature_csum2(journal) &&
+ jbd2_has_feature_csum3(journal)) {
/* Can't have checksum v2 and v3 at the same time! */
printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
"at the same time!\n");
goto out;
}
- if (jbd2_journal_has_csum_v2or3(journal) &&
- JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) {
+ if (jbd2_journal_has_csum_v2or3_feature(journal) &&
+ jbd2_has_feature_checksum(journal)) {
/* Can't have checksum v1 and v2 on at the same time! */
printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
"at the same time!\n");
@@ -1560,7 +1545,7 @@ static int journal_get_superblock(journal_t *journal)
}
/* Load the checksum driver */
- if (jbd2_journal_has_csum_v2or3(journal)) {
+ if (jbd2_journal_has_csum_v2or3_feature(journal)) {
journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
if (IS_ERR(journal->j_chksum_driver)) {
printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
@@ -1573,6 +1558,7 @@ static int journal_get_superblock(journal_t *journal)
/* Check superblock checksum */
if (!jbd2_superblock_csum_verify(journal, sb)) {
printk(KERN_ERR "JBD2: journal checksum error\n");
+ err = -EFSBADCRC;
goto out;
}
@@ -1664,7 +1650,7 @@ int jbd2_journal_load(journal_t *journal)
printk(KERN_ERR "JBD2: journal transaction %u on %s "
"is corrupt.\n", journal->j_failed_commit,
journal->j_devname);
- return -EIO;
+ return -EFSCORRUPTED;
}
/* OK, we've finished with the dynamic journal bits:
@@ -2086,8 +2072,12 @@ static void __journal_abort_soft (journal_t *journal, int errno)
__jbd2_journal_abort_hard(journal);
- if (errno)
+ if (errno) {
jbd2_journal_update_sb_errno(journal);
+ write_lock(&journal->j_state_lock);
+ journal->j_flags |= JBD2_REC_ERR;
+ write_unlock(&journal->j_state_lock);
+ }
}
/**
@@ -2212,15 +2202,15 @@ size_t journal_tag_bytes(journal_t *journal)
{
size_t sz;
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+ if (jbd2_has_feature_csum3(journal))
return sizeof(journal_block_tag3_t);
sz = sizeof(journal_block_tag_t);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (jbd2_has_feature_csum2(journal))
sz += sizeof(__u16);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(journal))
return sz;
else
return sz - sizeof(__u32);
@@ -2363,7 +2353,7 @@ static int jbd2_journal_init_journal_head_cache(void)
jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
sizeof(struct journal_head),
0, /* offset */
- SLAB_TEMPORARY, /* flags */
+ SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU,
NULL); /* ctor */
retval = 0;
if (!jbd2_journal_head_cache) {
@@ -2395,10 +2385,8 @@ static struct journal_head *journal_alloc_journal_head(void)
if (!ret) {
jbd_debug(1, "out of memory for journal_head\n");
pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
- while (!ret) {
- yield();
- ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
- }
+ ret = kmem_cache_zalloc(jbd2_journal_head_cache,
+ GFP_NOFS | __GFP_NOFAIL);
}
return ret;
}
diff --git a/kernel/fs/jbd2/recovery.c b/kernel/fs/jbd2/recovery.c
index a9079d035..7f277e49f 100644
--- a/kernel/fs/jbd2/recovery.c
+++ b/kernel/fs/jbd2/recovery.c
@@ -140,7 +140,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
if (offset >= journal->j_maxlen) {
printk(KERN_ERR "JBD2: corrupted journal superblock\n");
- return -EIO;
+ return -EFSCORRUPTED;
}
err = jbd2_journal_bmap(journal, offset, &blocknr);
@@ -342,7 +342,7 @@ static inline unsigned long long read_tag_block(journal_t *journal,
journal_block_tag_t *tag)
{
unsigned long long block = be32_to_cpu(tag->t_blocknr);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(journal))
block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
return block;
}
@@ -411,7 +411,7 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
- if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+ if (jbd2_has_feature_csum3(j))
return tag3->t_checksum == cpu_to_be32(csum32);
else
return tag->t_checksum == cpu_to_be16(csum32);
@@ -527,7 +527,7 @@ static int do_one_pass(journal_t *journal,
printk(KERN_ERR "JBD2: Invalid checksum "
"recovering block %lu in log\n",
next_log_block);
- err = -EIO;
+ err = -EFSBADCRC;
brelse(bh);
goto failed;
}
@@ -538,8 +538,7 @@ static int do_one_pass(journal_t *journal,
* just skip over the blocks it describes. */
if (pass != PASS_REPLAY) {
if (pass == PASS_SCAN &&
- JBD2_HAS_COMPAT_FEATURE(journal,
- JBD2_FEATURE_COMPAT_CHECKSUM) &&
+ jbd2_has_feature_checksum(journal) &&
!info->end_transaction) {
if (calc_chksums(journal, bh,
&next_log_block,
@@ -602,7 +601,7 @@ static int do_one_pass(journal_t *journal,
journal, tag, obh->b_data,
be32_to_cpu(tmp->h_sequence))) {
brelse(obh);
- success = -EIO;
+ success = -EFSBADCRC;
printk(KERN_ERR "JBD2: Invalid "
"checksum recovering "
"block %llu in log\n",
@@ -694,8 +693,7 @@ static int do_one_pass(journal_t *journal,
* much to do other than move on to the next sequence
* number. */
if (pass == PASS_SCAN &&
- JBD2_HAS_COMPAT_FEATURE(journal,
- JBD2_FEATURE_COMPAT_CHECKSUM)) {
+ jbd2_has_feature_checksum(journal)) {
int chksum_err, chksum_seen;
struct commit_header *cbh =
(struct commit_header *)bh->b_data;
@@ -735,8 +733,7 @@ static int do_one_pass(journal_t *journal,
if (chksum_err) {
info->end_transaction = next_commit_ID;
- if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){
+ if (!jbd2_has_feature_async_commit(journal)) {
journal->j_failed_commit =
next_commit_ID;
brelse(bh);
@@ -750,8 +747,7 @@ static int do_one_pass(journal_t *journal,
bh->b_data)) {
info->end_transaction = next_commit_ID;
- if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ if (!jbd2_has_feature_async_commit(journal)) {
journal->j_failed_commit =
next_commit_ID;
brelse(bh);
@@ -851,7 +847,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
rcount = be32_to_cpu(header->r_count);
if (!jbd2_revoke_block_csum_verify(journal, header))
- return -EINVAL;
+ return -EFSBADCRC;
if (jbd2_journal_has_csum_v2or3(journal))
csum_size = sizeof(struct jbd2_journal_revoke_tail);
@@ -859,7 +855,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
return -EINVAL;
max = rcount;
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(journal))
record_len = 8;
while (offset + record_len <= max) {
diff --git a/kernel/fs/jbd2/revoke.c b/kernel/fs/jbd2/revoke.c
index 14214da80..705ae5778 100644
--- a/kernel/fs/jbd2/revoke.c
+++ b/kernel/fs/jbd2/revoke.c
@@ -141,11 +141,13 @@ static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
{
struct list_head *hash_list;
struct jbd2_revoke_record_s *record;
+ gfp_t gfp_mask = GFP_NOFS;
-repeat:
- record = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS);
+ if (journal_oom_retry)
+ gfp_mask |= __GFP_NOFAIL;
+ record = kmem_cache_alloc(jbd2_revoke_record_cache, gfp_mask);
if (!record)
- goto oom;
+ return -ENOMEM;
record->sequence = seq;
record->blocknr = blocknr;
@@ -154,13 +156,6 @@ repeat:
list_add(&record->hash, hash_list);
spin_unlock(&journal->j_revoke_lock);
return 0;
-
-oom:
- if (!journal_oom_retry)
- return -ENOMEM;
- jbd_debug(1, "ENOMEM in %s, retrying\n", __func__);
- yield();
- goto repeat;
}
/* Find a revoke record in the journal's hash table. */
@@ -594,7 +589,7 @@ static void write_one_revoke_record(journal_t *journal,
if (jbd2_journal_has_csum_v2or3(journal))
csum_size = sizeof(struct jbd2_journal_revoke_tail);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(journal))
sz = 8;
else
sz = 4;
@@ -624,7 +619,7 @@ static void write_one_revoke_record(journal_t *journal,
*descriptorp = descriptor;
}
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(journal))
* ((__be64 *)(&descriptor->b_data[offset])) =
cpu_to_be64(record->blocknr);
else
diff --git a/kernel/fs/jbd2/transaction.c b/kernel/fs/jbd2/transaction.c
index ff2f2e6ad..ca181e81c 100644
--- a/kernel/fs/jbd2/transaction.c
+++ b/kernel/fs/jbd2/transaction.c
@@ -204,6 +204,20 @@ static int add_transaction_credits(journal_t *journal, int blocks,
* attach this handle to a new transaction.
*/
atomic_sub(total, &t->t_outstanding_credits);
+
+ /*
+ * Is the number of reserved credits in the current transaction too
+ * big to fit this handle? Wait until reserved credits are freed.
+ */
+ if (atomic_read(&journal->j_reserved_credits) + total >
+ journal->j_max_transaction_buffers) {
+ read_unlock(&journal->j_state_lock);
+ wait_event(journal->j_wait_reserved,
+ atomic_read(&journal->j_reserved_credits) + total <=
+ journal->j_max_transaction_buffers);
+ return 1;
+ }
+
wait_transaction_locked(journal);
return 1;
}
@@ -262,38 +276,36 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
int rsv_blocks = 0;
unsigned long ts = jiffies;
+ if (handle->h_rsv_handle)
+ rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
+
/*
- * 1/2 of transaction can be reserved so we can practically handle
- * only 1/2 of maximum transaction size per operation
+ * Limit the number of reserved credits to 1/2 of maximum transaction
+ * size and limit the number of total credits to not exceed maximum
+ * transaction size per operation.
*/
- if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) {
- printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n",
- current->comm, blocks,
- journal->j_max_transaction_buffers / 2);
+ if ((rsv_blocks > journal->j_max_transaction_buffers / 2) ||
+ (rsv_blocks + blocks > journal->j_max_transaction_buffers)) {
+ printk(KERN_ERR "JBD2: %s wants too many credits "
+ "credits:%d rsv_credits:%d max:%d\n",
+ current->comm, blocks, rsv_blocks,
+ journal->j_max_transaction_buffers);
+ WARN_ON(1);
return -ENOSPC;
}
- if (handle->h_rsv_handle)
- rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
-
alloc_transaction:
if (!journal->j_running_transaction) {
+ /*
+ * If __GFP_FS is not present, then we may be being called from
+ * inside the fs writeback layer, so we MUST NOT fail.
+ */
+ if ((gfp_mask & __GFP_FS) == 0)
+ gfp_mask |= __GFP_NOFAIL;
new_transaction = kmem_cache_zalloc(transaction_cache,
gfp_mask);
- if (!new_transaction) {
- /*
- * If __GFP_FS is not present, then we may be
- * being called from inside the fs writeback
- * layer, so we MUST NOT fail. Since
- * __GFP_NOFAIL is going away, we will arrange
- * to retry the allocation ourselves.
- */
- if ((gfp_mask & __GFP_FS) == 0) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- goto alloc_transaction;
- }
+ if (!new_transaction)
return -ENOMEM;
- }
}
jbd_debug(3, "New handle %p going live.\n", handle);
@@ -761,6 +773,30 @@ static void warn_dirty_buffer(struct buffer_head *bh)
bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
}
+/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */
+static void jbd2_freeze_jh_data(struct journal_head *jh)
+{
+ struct page *page;
+ int offset;
+ char *source;
+ struct buffer_head *bh = jh2bh(jh);
+
+ J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n");
+ page = bh->b_page;
+ offset = offset_in_page(bh->b_data);
+ source = kmap_atomic(page);
+ /* Fire data frozen trigger just before we copy the data */
+ jbd2_buffer_frozen_trigger(jh, source + offset, jh->b_triggers);
+ memcpy(jh->b_frozen_data, source + offset, bh->b_size);
+ kunmap_atomic(source);
+
+ /*
+ * Now that the frozen data is saved off, we need to store any matching
+ * triggers.
+ */
+ jh->b_frozen_triggers = jh->b_triggers;
+}
+
/*
* If the buffer is already part of the current transaction, then there
* is nothing we need to do. If it is already part of a prior
@@ -780,7 +816,6 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
journal_t *journal;
int error;
char *frozen_buffer = NULL;
- int need_copy = 0;
unsigned long start_lock, time_lock;
if (is_handle_aborted(handle))
@@ -867,119 +902,96 @@ repeat:
jh->b_modified = 0;
/*
+ * If the buffer is not journaled right now, we need to make sure it
+ * doesn't get written to disk before the caller actually commits the
+ * new data
+ */
+ if (!jh->b_transaction) {
+ JBUFFER_TRACE(jh, "no transaction");
+ J_ASSERT_JH(jh, !jh->b_next_transaction);
+ JBUFFER_TRACE(jh, "file as BJ_Reserved");
+ /*
+ * Make sure all stores to jh (b_modified, b_frozen_data) are
+ * visible before attaching it to the running transaction.
+ * Paired with barrier in jbd2_write_access_granted()
+ */
+ smp_wmb();
+ spin_lock(&journal->j_list_lock);
+ __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
+ spin_unlock(&journal->j_list_lock);
+ goto done;
+ }
+ /*
* If there is already a copy-out version of this buffer, then we don't
* need to make another one
*/
if (jh->b_frozen_data) {
JBUFFER_TRACE(jh, "has frozen data");
J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
- jh->b_next_transaction = transaction;
- goto done;
+ goto attach_next;
}
- /* Is there data here we need to preserve? */
+ JBUFFER_TRACE(jh, "owned by older transaction");
+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+ J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction);
- if (jh->b_transaction && jh->b_transaction != transaction) {
- JBUFFER_TRACE(jh, "owned by older transaction");
- J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
- J_ASSERT_JH(jh, jh->b_transaction ==
- journal->j_committing_transaction);
+ /*
+ * There is one case we have to be very careful about. If the
+ * committing transaction is currently writing this buffer out to disk
+ * and has NOT made a copy-out, then we cannot modify the buffer
+ * contents at all right now. The essence of copy-out is that it is
+ * the extra copy, not the primary copy, which gets journaled. If the
+ * primary copy is already going to disk then we cannot do copy-out
+ * here.
+ */
+ if (buffer_shadow(bh)) {
+ JBUFFER_TRACE(jh, "on shadow: sleep");
+ jbd_unlock_bh_state(bh);
+ wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
+ goto repeat;
+ }
- /* There is one case we have to be very careful about.
- * If the committing transaction is currently writing
- * this buffer out to disk and has NOT made a copy-out,
- * then we cannot modify the buffer contents at all
- * right now. The essence of copy-out is that it is the
- * extra copy, not the primary copy, which gets
- * journaled. If the primary copy is already going to
- * disk then we cannot do copy-out here. */
-
- if (buffer_shadow(bh)) {
- JBUFFER_TRACE(jh, "on shadow: sleep");
+ /*
+ * Only do the copy if the currently-owning transaction still needs it.
+ * If buffer isn't on BJ_Metadata list, the committing transaction is
+ * past that stage (here we use the fact that BH_Shadow is set under
+ * bh_state lock together with refiling to BJ_Shadow list and at this
+ * point we know the buffer doesn't have BH_Shadow set).
+ *
+ * Subtle point, though: if this is a get_undo_access, then we will be
+ * relying on the frozen_data to contain the new value of the
+ * committed_data record after the transaction, so we HAVE to force the
+ * frozen_data copy in that case.
+ */
+ if (jh->b_jlist == BJ_Metadata || force_copy) {
+ JBUFFER_TRACE(jh, "generate frozen data");
+ if (!frozen_buffer) {
+ JBUFFER_TRACE(jh, "allocate memory for buffer");
jbd_unlock_bh_state(bh);
- wait_on_bit_io(&bh->b_state, BH_Shadow,
- TASK_UNINTERRUPTIBLE);
- goto repeat;
- }
-
- /*
- * Only do the copy if the currently-owning transaction still
- * needs it. If buffer isn't on BJ_Metadata list, the
- * committing transaction is past that stage (here we use the
- * fact that BH_Shadow is set under bh_state lock together with
- * refiling to BJ_Shadow list and at this point we know the
- * buffer doesn't have BH_Shadow set).
- *
- * Subtle point, though: if this is a get_undo_access,
- * then we will be relying on the frozen_data to contain
- * the new value of the committed_data record after the
- * transaction, so we HAVE to force the frozen_data copy
- * in that case.
- */
- if (jh->b_jlist == BJ_Metadata || force_copy) {
- JBUFFER_TRACE(jh, "generate frozen data");
+ frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS);
if (!frozen_buffer) {
- JBUFFER_TRACE(jh, "allocate memory for buffer");
- jbd_unlock_bh_state(bh);
- frozen_buffer =
- jbd2_alloc(jh2bh(jh)->b_size,
- GFP_NOFS);
- if (!frozen_buffer) {
- printk(KERN_ERR
- "%s: OOM for frozen_buffer\n",
- __func__);
- JBUFFER_TRACE(jh, "oom!");
- error = -ENOMEM;
- jbd_lock_bh_state(bh);
- goto done;
- }
- goto repeat;
+ printk(KERN_ERR "%s: OOM for frozen_buffer\n",
+ __func__);
+ JBUFFER_TRACE(jh, "oom!");
+ error = -ENOMEM;
+ goto out;
}
- jh->b_frozen_data = frozen_buffer;
- frozen_buffer = NULL;
- need_copy = 1;
+ goto repeat;
}
- jh->b_next_transaction = transaction;
+ jh->b_frozen_data = frozen_buffer;
+ frozen_buffer = NULL;
+ jbd2_freeze_jh_data(jh);
}
-
-
+attach_next:
/*
- * Finally, if the buffer is not journaled right now, we need to make
- * sure it doesn't get written to disk before the caller actually
- * commits the new data
+ * Make sure all stores to jh (b_modified, b_frozen_data) are visible
+ * before attaching it to the running transaction. Paired with barrier
+ * in jbd2_write_access_granted()
*/
- if (!jh->b_transaction) {
- JBUFFER_TRACE(jh, "no transaction");
- J_ASSERT_JH(jh, !jh->b_next_transaction);
- JBUFFER_TRACE(jh, "file as BJ_Reserved");
- spin_lock(&journal->j_list_lock);
- __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
- spin_unlock(&journal->j_list_lock);
- }
+ smp_wmb();
+ jh->b_next_transaction = transaction;
done:
- if (need_copy) {
- struct page *page;
- int offset;
- char *source;
-
- J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
- "Possible IO failure.\n");
- page = jh2bh(jh)->b_page;
- offset = offset_in_page(jh2bh(jh)->b_data);
- source = kmap_atomic(page);
- /* Fire data frozen trigger just before we copy the data */
- jbd2_buffer_frozen_trigger(jh, source + offset,
- jh->b_triggers);
- memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
- kunmap_atomic(source);
-
- /*
- * Now that the frozen data is saved off, we need to store
- * any matching triggers.
- */
- jh->b_frozen_triggers = jh->b_triggers;
- }
jbd_unlock_bh_state(bh);
/*
@@ -996,6 +1008,59 @@ out:
return error;
}
+/* Fast check whether buffer is already attached to the required transaction */
+static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh,
+ bool undo)
+{
+ struct journal_head *jh;
+ bool ret = false;
+
+ /* Dirty buffers require special handling... */
+ if (buffer_dirty(bh))
+ return false;
+
+ /*
+ * RCU protects us from dereferencing freed pages. So the checks we do
+ * are guaranteed not to oops. However the jh slab object can get freed
+ * & reallocated while we work with it. So we have to be careful. When
+ * we see jh attached to the running transaction, we know it must stay
+ * so until the transaction is committed. Thus jh won't be freed and
+ * will be attached to the same bh while we run. However it can
+ * happen jh gets freed, reallocated, and attached to the transaction
+ * just after we get pointer to it from bh. So we have to be careful
+ * and recheck jh still belongs to our bh before we return success.
+ */
+ rcu_read_lock();
+ if (!buffer_jbd(bh))
+ goto out;
+ /* This should be bh2jh() but that doesn't work with inline functions */
+ jh = READ_ONCE(bh->b_private);
+ if (!jh)
+ goto out;
+ /* For undo access buffer must have data copied */
+ if (undo && !jh->b_committed_data)
+ goto out;
+ if (jh->b_transaction != handle->h_transaction &&
+ jh->b_next_transaction != handle->h_transaction)
+ goto out;
+ /*
+ * There are two reasons for the barrier here:
+ * 1) Make sure to fetch b_bh after we did previous checks so that we
+ * detect when jh went through free, realloc, attach to transaction
+ * while we were checking. Paired with implicit barrier in that path.
+ * 2) So that access to bh done after jbd2_write_access_granted()
+ * doesn't get reordered and see inconsistent state of concurrent
+ * do_get_write_access().
+ */
+ smp_mb();
+ if (unlikely(jh->b_bh != bh))
+ goto out;
+ ret = true;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
/**
* int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
* @handle: transaction to add buffer modifications to
@@ -1009,9 +1074,13 @@ out:
int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
{
- struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+ struct journal_head *jh;
int rc;
+ if (jbd2_write_access_granted(handle, bh, false))
+ return 0;
+
+ jh = jbd2_journal_add_journal_head(bh);
/* We do not want to get caught playing with fields which the
* log thread also manipulates. Make sure that the buffer
* completes any outstanding IO before proceeding. */
@@ -1141,11 +1210,14 @@ out:
int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
{
int err;
- struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+ struct journal_head *jh;
char *committed_data = NULL;
JBUFFER_TRACE(jh, "entry");
+ if (jbd2_write_access_granted(handle, bh, true))
+ return 0;
+ jh = jbd2_journal_add_journal_head(bh);
/*
* Do this first --- it can drop the journal lock, so we want to
* make sure that obtaining the committed_data is done
@@ -1230,8 +1302,6 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
triggers->t_abort(triggers, jh2bh(jh));
}
-
-
/**
* int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
* @handle: transaction to add buffer to.
@@ -1264,12 +1334,41 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
if (is_handle_aborted(handle))
return -EROFS;
- journal = transaction->t_journal;
- jh = jbd2_journal_grab_journal_head(bh);
- if (!jh) {
+ if (!buffer_jbd(bh)) {
ret = -EUCLEAN;
goto out;
}
+ /*
+ * We don't grab jh reference here since the buffer must be part
+ * of the running transaction.
+ */
+ jh = bh2jh(bh);
+ /*
+ * This and the following assertions are unreliable since we may see jh
+ * in inconsistent state unless we grab bh_state lock. But this is
+ * crucial to catch bugs so let's do a reliable check until the
+ * lockless handling is fully proven.
+ */
+ if (jh->b_transaction != transaction &&
+ jh->b_next_transaction != transaction) {
+ jbd_lock_bh_state(bh);
+ J_ASSERT_JH(jh, jh->b_transaction == transaction ||
+ jh->b_next_transaction == transaction);
+ jbd_unlock_bh_state(bh);
+ }
+ if (jh->b_modified == 1) {
+ /* If it's in our transaction it must be in BJ_Metadata list. */
+ if (jh->b_transaction == transaction &&
+ jh->b_jlist != BJ_Metadata) {
+ jbd_lock_bh_state(bh);
+ J_ASSERT_JH(jh, jh->b_transaction != transaction ||
+ jh->b_jlist == BJ_Metadata);
+ jbd_unlock_bh_state(bh);
+ }
+ goto out;
+ }
+
+ journal = transaction->t_journal;
jbd_debug(5, "journal_head %p\n", jh);
JBUFFER_TRACE(jh, "entry");
@@ -1360,7 +1459,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
spin_unlock(&journal->j_list_lock);
out_unlock_bh:
jbd_unlock_bh_state(bh);
- jbd2_journal_put_journal_head(jh);
out:
JBUFFER_TRACE(jh, "exit");
return ret;
@@ -1843,8 +1941,8 @@ out:
* @journal: journal for operation
* @page: to try and free
* @gfp_mask: we use the mask to detect how hard should we try to release
- * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
- * release the buffers.
+ * buffers. If __GFP_DIRECT_RECLAIM and __GFP_FS is set, we wait for commit
+ * code to release the buffers.
*
*
* For all the buffers on this page,
@@ -2058,6 +2156,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
if (!buffer_dirty(bh)) {
/* bdflush has written it. We can drop it now */
+ __jbd2_journal_remove_checkpoint(jh);
goto zap_buffer;
}
@@ -2087,6 +2186,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
/* The orphan record's transaction has
* committed. We can cleanse this buffer */
clear_buffer_jbddirty(bh);
+ __jbd2_journal_remove_checkpoint(jh);
goto zap_buffer;
}
}