summaryrefslogtreecommitdiffstats
path: root/kernel/fs/ext4/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/fs/ext4/inode.c')
-rw-r--r--kernel/fs/ext4/inode.c238
1 files changed, 155 insertions, 83 deletions
diff --git a/kernel/fs/ext4/inode.c b/kernel/fs/ext4/inode.c
index 966c61482..06bda0361 100644
--- a/kernel/fs/ext4/inode.c
+++ b/kernel/fs/ext4/inode.c
@@ -22,6 +22,7 @@
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
@@ -377,7 +378,7 @@ static int __check_block_validity(struct inode *inode, const char *func,
"lblock %lu mapped to illegal pblock "
"(length %d)", (unsigned long) map->m_lblk,
map->m_len);
- return -EIO;
+ return -EFSCORRUPTED;
}
return 0;
}
@@ -479,7 +480,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
/* We can handle the block number less than EXT_MAX_BLOCKS */
if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
- return -EIO;
+ return -EFSCORRUPTED;
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
@@ -656,16 +657,32 @@ has_zeroout:
return retval;
}
-static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
+/*
+ * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages
+ * we have to be careful as someone else may be manipulating b_state as well.
+ */
+static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
{
- struct inode *inode = bh->b_assoc_map->host;
- /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
- loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
- int err;
- if (!uptodate)
+ unsigned long old_state;
+ unsigned long new_state;
+
+ flags &= EXT4_MAP_FLAGS;
+
+ /* Dummy buffer_head? Set non-atomically. */
+ if (!bh->b_page) {
+ bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
return;
- WARN_ON(!buffer_unwritten(bh));
- err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
+ }
+ /*
+ * Someone else may be modifying b_state. Be careful! This is ugly but
+ * once we get rid of using bh as a container for mapping information
+ * to pass to / from get_block functions, this can go away.
+ */
+ do {
+ old_state = READ_ONCE(bh->b_state);
+ new_state = (old_state & ~EXT4_MAP_FLAGS) | flags;
+ } while (unlikely(
+ cmpxchg(&bh->b_state, old_state, new_state) != old_state));
}
/* Maximum number of blocks we map for direct IO at once. */
@@ -704,11 +721,16 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
ext4_io_end_t *io_end = ext4_inode_aio(inode);
map_bh(bh, inode->i_sb, map.m_pblk);
- bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
- if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
+ ext4_update_bh_state(bh, map.m_flags);
+ if (IS_DAX(inode) && buffer_unwritten(bh)) {
+ /*
+ * dgc: I suspect unwritten conversion on ext4+DAX is
+ * fundamentally broken here when there are concurrent
+ * read/write in progress on this inode.
+ */
+ WARN_ON_ONCE(io_end);
bh->b_assoc_map = inode->i_mapping;
bh->b_private = (void *)(unsigned long)iblock;
- bh->b_end_io = ext4_end_io_unwritten;
}
if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
set_buffer_defer_completion(bh);
@@ -731,18 +753,18 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
* `handle' can be NULL if create is zero
*/
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
- ext4_lblk_t block, int create)
+ ext4_lblk_t block, int map_flags)
{
struct ext4_map_blocks map;
struct buffer_head *bh;
+ int create = map_flags & EXT4_GET_BLOCKS_CREATE;
int err;
J_ASSERT(handle != NULL || create == 0);
map.m_lblk = block;
map.m_len = 1;
- err = ext4_map_blocks(handle, inode, &map,
- create ? EXT4_GET_BLOCKS_CREATE : 0);
+ err = ext4_map_blocks(handle, inode, &map, map_flags);
if (err == 0)
return create ? ERR_PTR(-ENOSPC) : NULL;
@@ -788,11 +810,11 @@ errout:
}
struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
- ext4_lblk_t block, int create)
+ ext4_lblk_t block, int map_flags)
{
struct buffer_head *bh;
- bh = ext4_getblk(handle, inode, block, create);
+ bh = ext4_getblk(handle, inode, block, map_flags);
if (IS_ERR(bh))
return bh;
if (!bh || buffer_uptodate(bh))
@@ -971,7 +993,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
if (unlikely(err))
page_zero_new_buffers(page, from, to);
else if (decrypt)
- err = ext4_decrypt_one(inode, page);
+ err = ext4_decrypt(page);
return err;
}
#endif
@@ -1187,6 +1209,38 @@ errout:
return ret ? ret : copied;
}
+/*
+ * This is a private version of page_zero_new_buffers() which doesn't
+ * set the buffer to be dirty, since in data=journalled mode we need
+ * to call ext4_handle_dirty_metadata() instead.
+ */
+static void zero_new_buffers(struct page *page, unsigned from, unsigned to)
+{
+ unsigned int block_start = 0, block_end;
+ struct buffer_head *head, *bh;
+
+ bh = head = page_buffers(page);
+ do {
+ block_end = block_start + bh->b_size;
+ if (buffer_new(bh)) {
+ if (block_end > from && block_start < to) {
+ if (!PageUptodate(page)) {
+ unsigned start, size;
+
+ start = max(from, block_start);
+ size = min(to, block_end) - start;
+
+ zero_user(page, start, size);
+ set_buffer_uptodate(bh);
+ }
+ clear_buffer_new(bh);
+ }
+ }
+ block_start = block_end;
+ bh = bh->b_this_page;
+ } while (bh != head);
+}
+
static int ext4_journalled_write_end(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
@@ -1213,7 +1267,7 @@ static int ext4_journalled_write_end(struct file *file,
if (copied < len) {
if (!PageUptodate(page))
copied = 0;
- page_zero_new_buffers(page, from+copied, to);
+ zero_new_buffers(page, from+copied, to);
}
ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
@@ -1261,13 +1315,12 @@ static int ext4_journalled_write_end(struct file *file,
}
/*
- * Reserve a single cluster located at lblock
+ * Reserve space for a single cluster
*/
-static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
+static int ext4_da_reserve_space(struct inode *inode)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned int md_needed;
int ret;
/*
@@ -1279,25 +1332,14 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
if (ret)
return ret;
- /*
- * recalculate the amount of metadata blocks to reserve
- * in order to allocate nrblocks
- * worse case is one extent per block
- */
spin_lock(&ei->i_block_reservation_lock);
- /*
- * ext4_calc_metadata_amount() has side effects, which we have
- * to be prepared undo if we fail to claim space.
- */
- md_needed = 0;
- trace_ext4_da_reserve_space(inode, 0);
-
if (ext4_claim_free_clusters(sbi, 1, 0)) {
spin_unlock(&ei->i_block_reservation_lock);
dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
return -ENOSPC;
}
ei->i_reserved_data_blocks++;
+ trace_ext4_da_reserve_space(inode);
spin_unlock(&ei->i_block_reservation_lock);
return 0; /* success */
@@ -1575,9 +1617,9 @@ add_delayed:
* then we don't need to reserve it again. However we still need
* to reserve metadata for every block we're going to write.
*/
- if (EXT4_SB(inode->i_sb)->s_cluster_ratio <= 1 ||
+ if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 ||
!ext4_find_delalloc_cluster(inode, map->m_lblk)) {
- ret = ext4_da_reserve_space(inode, iblock);
+ ret = ext4_da_reserve_space(inode);
if (ret) {
/* not enough space to reserve */
retval = ret;
@@ -1655,7 +1697,7 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
return ret;
map_bh(bh, inode->i_sb, map.m_pblk);
- bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ ext4_update_bh_state(bh, map.m_flags);
if (buffer_unwritten(bh)) {
/* A delayed write to unwritten bh should be marked
@@ -1833,11 +1875,22 @@ static int ext4_writepage(struct page *page,
* the page. But we may reach here when we do a journal commit via
* journal_submit_inode_data_buffers() and in that case we must write
* allocated buffers to achieve data=ordered mode guarantees.
+ *
+ * Also, if there is only one buffer per page (the fs block
+ * size == the page size), if one buffer needs block
+ * allocation or needs to modify the extent tree to clear the
+ * unwritten flag, we know that the page can't be written at
+ * all, so we might as well refuse the write immediately.
+ * Unfortunately if the block size != page size, we can't as
+ * easily detect this case using ext4_walk_page_buffers(), but
+ * for the extremely common case, this is an optimization that
+ * skips a useless round trip through ext4_bio_write_page().
*/
if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
ext4_bh_delay_or_unwritten)) {
redirty_page_for_writepage(wbc, page);
- if (current->flags & PF_MEMALLOC) {
+ if ((current->flags & PF_MEMALLOC) ||
+ (inode->i_sb->s_blocksize == PAGE_CACHE_SIZE)) {
/*
* For memory cleaning there's no point in writing only
* some buffers. So just bail out. Warn if we came here
@@ -2617,8 +2670,7 @@ static int ext4_nonda_switch(struct super_block *sb)
/* We always reserve for an inode update; the superblock could be there too */
static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len)
{
- if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE)))
+ if (likely(ext4_has_feature_large_file(inode->i_sb)))
return 1;
if (pos + len <= 0x7fffffffULL)
@@ -3039,6 +3091,17 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
EXT4_GET_BLOCKS_NO_LOCK);
}
+int ext4_get_block_dax(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT;
+ if (create)
+ flags |= EXT4_GET_BLOCKS_CREATE;
+ ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ return _ext4_get_block(inode, iblock, bh_result, flags);
+}
+
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private)
{
@@ -3351,7 +3414,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
int err = 0;
page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
- mapping_gfp_mask(mapping) & ~__GFP_FS);
+ mapping_gfp_constraint(mapping, ~__GFP_FS));
if (!page)
return -ENOMEM;
@@ -3400,7 +3463,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
/* We expect the key to be set. */
BUG_ON(!ext4_has_encryption_key(inode));
BUG_ON(blocksize != PAGE_CACHE_SIZE);
- WARN_ON_ONCE(ext4_decrypt_one(inode, page));
+ WARN_ON_ONCE(ext4_decrypt(page));
}
}
if (ext4_should_journal_data(inode)) {
@@ -3827,7 +3890,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
iloc->bh = NULL;
if (!ext4_valid_inum(sb, inode->i_ino))
- return -EIO;
+ return -EFSCORRUPTED;
iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
@@ -4013,8 +4076,7 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
struct inode *inode = &(ei->vfs_inode);
struct super_block *sb = inode->i_sb;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+ if (ext4_has_feature_huge_file(sb)) {
/* we are using combined 48 bit field */
i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
le32_to_cpu(raw_inode->i_blocks_lo);
@@ -4075,7 +4137,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
EXT4_INODE_SIZE(inode->i_sb));
- ret = -EIO;
+ ret = -EFSCORRUPTED;
goto bad_inode;
}
} else
@@ -4095,7 +4157,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
EXT4_ERROR_INODE(inode, "checksum invalid");
- ret = -EIO;
+ ret = -EFSBADCRC;
goto bad_inode;
}
@@ -4137,7 +4199,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
+ if (ext4_has_feature_64bit(sb))
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
inode->i_size = ext4_isize(raw_inode);
@@ -4210,7 +4272,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
!ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
ei->i_file_acl);
- ret = -EIO;
+ ret = -EFSCORRUPTED;
goto bad_inode;
} else if (!ext4_has_inline_data(inode)) {
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
@@ -4237,8 +4299,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &ext4_dir_inode_operations;
inode->i_fop = &ext4_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
- if (ext4_inode_is_fast_symlink(inode) &&
- !ext4_encrypted_inode(inode)) {
+ if (ext4_encrypted_inode(inode)) {
+ inode->i_op = &ext4_encrypted_symlink_inode_operations;
+ ext4_set_aops(inode);
+ } else if (ext4_inode_is_fast_symlink(inode)) {
+ inode->i_link = (char *)ei->i_data;
inode->i_op = &ext4_fast_symlink_inode_operations;
nd_terminate_link(ei->i_data, inode->i_size,
sizeof(ei->i_data) - 1);
@@ -4258,7 +4323,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
} else if (ino == EXT4_BOOT_LOADER_INO) {
make_bad_inode(inode);
} else {
- ret = -EIO;
+ ret = -EFSCORRUPTED;
EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
goto bad_inode;
}
@@ -4276,7 +4341,7 @@ bad_inode:
struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino)
{
if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
return ext4_iget(sb, ino);
}
@@ -4298,7 +4363,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
return 0;
}
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
+ if (!ext4_has_feature_huge_file(sb))
return -EFBIG;
if (i_blocks <= 0xffffffffffffULL) {
@@ -4459,8 +4524,7 @@ static int ext4_do_update_inode(handle_t *handle,
need_datasync = 1;
}
if (ei->i_disksize > 0x7fffffffULL) {
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
+ if (!ext4_has_feature_large_file(sb) ||
EXT4_SB(sb)->s_es->s_rev_level ==
cpu_to_le32(EXT4_GOOD_OLD_REV))
set_large_file = 1;
@@ -4509,8 +4573,7 @@ static int ext4_do_update_inode(handle_t *handle,
if (err)
goto out_brelse;
ext4_update_dynamic_rev(sb);
- EXT4_SET_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
+ ext4_set_feature_large_file(sb);
ext4_handle_sync(handle);
err = ext4_handle_dirty_super(handle, sb);
}
@@ -4677,8 +4740,11 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
- if (is_quota_modification(inode, attr))
- dquot_initialize(inode);
+ if (is_quota_modification(inode, attr)) {
+ error = dquot_initialize(inode);
+ if (error)
+ return error;
+ }
if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
(ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
handle_t *handle;
@@ -4707,8 +4773,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
ext4_journal_stop(handle);
}
- if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
+ if (attr->ia_valid & ATTR_SIZE) {
handle_t *handle;
+ loff_t oldsize = inode->i_size;
+ int shrink = (attr->ia_size <= inode->i_size);
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -4716,27 +4784,37 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_size > sbi->s_bitmap_maxbytes)
return -EFBIG;
}
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
inode_inc_iversion(inode);
- if (S_ISREG(inode->i_mode) &&
+ if (ext4_should_order_data(inode) &&
(attr->ia_size < inode->i_size)) {
- if (ext4_should_order_data(inode)) {
- error = ext4_begin_ordered_truncate(inode,
+ error = ext4_begin_ordered_truncate(inode,
attr->ia_size);
- if (error)
- goto err_out;
- }
+ if (error)
+ goto err_out;
+ }
+ if (attr->ia_size != inode->i_size) {
handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto err_out;
}
- if (ext4_handle_valid(handle)) {
+ if (ext4_handle_valid(handle) && shrink) {
error = ext4_orphan_add(handle, inode);
orphan = 1;
}
+ /*
+ * Update c/mtime on truncate up, ext4_truncate() will
+ * update c/mtime in shrink case below
+ */
+ if (!shrink) {
+ inode->i_mtime = ext4_current_time(inode);
+ inode->i_ctime = inode->i_mtime;
+ }
down_write(&EXT4_I(inode)->i_data_sem);
EXT4_I(inode)->i_disksize = attr->ia_size;
rc = ext4_mark_inode_dirty(handle, inode);
@@ -4752,15 +4830,13 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
up_write(&EXT4_I(inode)->i_data_sem);
ext4_journal_stop(handle);
if (error) {
- ext4_orphan_del(NULL, inode);
+ if (orphan)
+ ext4_orphan_del(NULL, inode);
goto err_out;
}
- } else {
- loff_t oldsize = inode->i_size;
-
- i_size_write(inode, attr->ia_size);
- pagecache_isize_extended(inode, oldsize, inode->i_size);
}
+ if (!shrink)
+ pagecache_isize_extended(inode, oldsize, inode->i_size);
/*
* Blocks are going to be removed from the inode. Wait
@@ -4780,13 +4856,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
* in data=journal mode to make pages freeable.
*/
truncate_pagecache(inode, inode->i_size);
+ if (shrink)
+ ext4_truncate(inode);
}
- /*
- * We want to call ext4_truncate() even if attr->ia_size ==
- * inode->i_size for cases like truncation of fallocated space
- */
- if (attr->ia_valid & ATTR_SIZE)
- ext4_truncate(inode);
if (!rc) {
setattr_copy(inode, attr);
@@ -5239,7 +5311,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
!ext4_should_journal_data(inode) &&
!ext4_nonda_switch(inode->i_sb)) {
do {
- ret = __block_page_mkwrite(vma, vmf,
+ ret = block_page_mkwrite(vma, vmf,
ext4_da_get_block_prep);
} while (ret == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries));
@@ -5286,7 +5358,7 @@ retry_alloc:
ret = VM_FAULT_SIGBUS;
goto out;
}
- ret = __block_page_mkwrite(vma, vmf, get_block);
+ ret = block_page_mkwrite(vma, vmf, get_block);
if (!ret && ext4_should_journal_data(inode)) {
if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {