summaryrefslogtreecommitdiffstats
path: root/kernel/fs/btrfs/send.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/fs/btrfs/send.c')
-rw-r--r--kernel/fs/btrfs/send.c377
1 files changed, 320 insertions, 57 deletions
diff --git a/kernel/fs/btrfs/send.c b/kernel/fs/btrfs/send.c
index a1216f9b4..63a6152be 100644
--- a/kernel/fs/btrfs/send.c
+++ b/kernel/fs/btrfs/send.c
@@ -243,6 +243,7 @@ struct waiting_dir_move {
* after this directory is moved, we can try to rmdir the ino rmdir_ino.
*/
u64 rmdir_ino;
+ bool orphanized;
};
struct orphan_dir_info {
@@ -1158,6 +1159,9 @@ struct backref_ctx {
/* may be truncated in case it's the last extent in a file */
u64 extent_len;
+ /* data offset in the file extent item */
+ u64 data_offset;
+
/* Just to check for bugs in backref resolving */
int found_itself;
};
@@ -1221,7 +1225,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
if (ret < 0)
return ret;
- if (offset + bctx->extent_len > i_size)
+ if (offset + bctx->data_offset + bctx->extent_len > i_size)
return 0;
/*
@@ -1363,6 +1367,19 @@ static int find_extent_clone(struct send_ctx *sctx,
backref_ctx->cur_offset = data_offset;
backref_ctx->found_itself = 0;
backref_ctx->extent_len = num_bytes;
+ /*
+ * For non-compressed extents iterate_extent_inodes() gives us extent
+ * offsets that already take into account the data offset, but not for
+ * compressed extents, since the offset is logical and not relative to
+ * the physical extent locations. We must take this into account to
+ * avoid sending clone offsets that go beyond the source file's size,
+ * which would result in the clone ioctl failing with -EINVAL on the
+ * receiving end.
+ */
+ if (compressed == BTRFS_COMPRESS_NONE)
+ backref_ctx->data_offset = 0;
+ else
+ backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi);
/*
* The last extent of a file may be too large due to page alignment.
@@ -1417,16 +1434,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
}
if (cur_clone_root) {
- if (compressed != BTRFS_COMPRESS_NONE) {
- /*
- * Offsets given by iterate_extent_inodes() are relative
- * to the start of the extent, we need to add logical
- * offset from the file extent item.
- * (See why at backref.c:check_extent_in_eb())
- */
- cur_clone_root->offset += btrfs_file_extent_offset(eb,
- fi);
- }
*found = cur_clone_root;
ret = 0;
} else {
@@ -1462,7 +1469,21 @@ static int read_symlink(struct btrfs_root *root,
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret);
+ if (ret) {
+ /*
+ * An empty symlink inode. Can happen in rare error paths when
+ * creating a symlink (transaction committed before the inode
+ * eviction handler removed the symlink inode items and a crash
+ * happened in between or the subvol was snapshoted in between).
+ * Print an informative message to dmesg/syslog so that the user
+ * can delete the symlink.
+ */
+ btrfs_err(root->fs_info,
+ "Found empty symlink inode %llu at root %llu",
+ ino, root->root_key.objectid);
+ ret = -EIO;
+ goto out;
+ }
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_file_extent_item);
@@ -1900,8 +1921,15 @@ static int did_overwrite_ref(struct send_ctx *sctx,
goto out;
}
- /* we know that it is or will be overwritten. check this now */
- if (ow_inode < sctx->send_progress)
+ /*
+ * We know that it is or will be overwritten. Check this now.
+ * The current inode being processed might have been the one that caused
+ * inode 'ino' to be orphanized, therefore check if ow_inode matches
+ * the current inode being processed.
+ */
+ if ((ow_inode < sctx->send_progress) ||
+ (ino != sctx->cur_ino && ow_inode == sctx->cur_ino &&
+ gen == sctx->cur_inode_gen))
ret = 1;
else
ret = 0;
@@ -2223,6 +2251,8 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
fs_path_reset(dest);
while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
+ struct waiting_dir_move *wdm;
+
fs_path_reset(name);
if (is_waiting_for_rm(sctx, ino)) {
@@ -2233,7 +2263,11 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
break;
}
- if (is_waiting_for_move(sctx, ino)) {
+ wdm = get_waiting_dir_move(sctx, ino);
+ if (wdm && wdm->orphanized) {
+ ret = gen_unique_name(sctx, ino, gen, name);
+ stop = 1;
+ } else if (wdm) {
ret = get_first_ref(sctx->parent_root, ino,
&parent_inode, &parent_gen, name);
} else {
@@ -2323,13 +2357,23 @@ static int send_subvol_begin(struct send_ctx *sctx)
}
TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
- TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
- sctx->send_root->root_item.uuid);
+
+ if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid))
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
+ sctx->send_root->root_item.received_uuid);
+ else
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
+ sctx->send_root->root_item.uuid);
+
TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
le64_to_cpu(sctx->send_root->root_item.ctransid));
if (parent_root) {
- TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
- sctx->parent_root->root_item.uuid);
+ if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid))
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ parent_root->root_item.received_uuid);
+ else
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ parent_root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
le64_to_cpu(sctx->parent_root->root_item.ctransid));
}
@@ -2530,7 +2574,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
} else if (S_ISSOCK(mode)) {
cmd = BTRFS_SEND_C_MKSOCK;
} else {
- printk(KERN_WARNING "btrfs: unexpected inode type %o",
+ btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
(int)(mode & S_IFMT));
ret = -ENOTSUPP;
goto out;
@@ -2923,7 +2967,7 @@ static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
return entry != NULL;
}
-static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
{
struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
struct rb_node *parent = NULL;
@@ -2934,6 +2978,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
return -ENOMEM;
dm->ino = ino;
dm->rmdir_ino = 0;
+ dm->orphanized = orphanized;
while (*p) {
parent = *p;
@@ -3030,7 +3075,7 @@ static int add_pending_dir_move(struct send_ctx *sctx,
goto out;
}
- ret = add_waiting_dir_move(sctx, pm->ino);
+ ret = add_waiting_dir_move(sctx, pm->ino, is_orphan);
if (ret)
goto out;
@@ -3353,8 +3398,40 @@ out:
return ret;
}
+/*
+ * Check if ino ino1 is an ancestor of inode ino2 in the given root.
+ * Return 1 if true, 0 if false and < 0 on error.
+ */
+static int is_ancestor(struct btrfs_root *root,
+ const u64 ino1,
+ const u64 ino1_gen,
+ const u64 ino2,
+ struct fs_path *fs_path)
+{
+ u64 ino = ino2;
+
+ while (ino > BTRFS_FIRST_FREE_OBJECTID) {
+ int ret;
+ u64 parent;
+ u64 parent_gen;
+
+ fs_path_reset(fs_path);
+ ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path);
+ if (ret < 0) {
+ if (ret == -ENOENT && ino == ino2)
+ ret = 0;
+ return ret;
+ }
+ if (parent == ino1)
+ return parent_gen == ino1_gen ? 1 : 0;
+ ino = parent;
+ }
+ return 0;
+}
+
static int wait_for_parent_move(struct send_ctx *sctx,
- struct recorded_ref *parent_ref)
+ struct recorded_ref *parent_ref,
+ const bool is_orphan)
{
int ret = 0;
u64 ino = parent_ref->dir;
@@ -3374,11 +3451,24 @@ static int wait_for_parent_move(struct send_ctx *sctx,
* Our current directory inode may not yet be renamed/moved because some
* ancestor (immediate or not) has to be renamed/moved first. So find if
* such ancestor exists and make sure our own rename/move happens after
- * that ancestor is processed.
+ * that ancestor is processed to avoid path build infinite loops (done
+ * at get_cur_path()).
*/
while (ino > BTRFS_FIRST_FREE_OBJECTID) {
if (is_waiting_for_move(sctx, ino)) {
- ret = 1;
+ /*
+ * If the current inode is an ancestor of ino in the
+ * parent root, we need to delay the rename of the
+ * current inode, otherwise don't delayed the rename
+ * because we can end up with a circular dependency
+ * of renames, resulting in some directories never
+ * getting the respective rename operations issued in
+ * the send stream or getting into infinite path build
+ * loops.
+ */
+ ret = is_ancestor(sctx->parent_root,
+ sctx->cur_ino, sctx->cur_inode_gen,
+ ino, path_before);
break;
}
@@ -3420,7 +3510,7 @@ out:
ino,
&sctx->new_refs,
&sctx->deleted_refs,
- false);
+ is_orphan);
if (!ret)
ret = 1;
}
@@ -3589,6 +3679,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
}
}
+ if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root &&
+ can_rename) {
+ ret = wait_for_parent_move(sctx, cur, is_orphan);
+ if (ret < 0)
+ goto out;
+ if (ret == 1) {
+ can_rename = false;
+ *pending_move = 1;
+ }
+ }
+
/*
* link/move the ref to the new place. If we have an orphan
* inode, move it and update valid_path. If not, link or move
@@ -3609,18 +3710,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
* dirs, we always have one new and one deleted
* ref. The deleted ref is ignored later.
*/
- ret = wait_for_parent_move(sctx, cur);
- if (ret < 0)
- goto out;
- if (ret) {
- *pending_move = 1;
- } else {
- ret = send_rename(sctx, valid_path,
- cur->full_path);
- if (!ret)
- ret = fs_path_copy(valid_path,
- cur->full_path);
- }
+ ret = send_rename(sctx, valid_path,
+ cur->full_path);
+ if (!ret)
+ ret = fs_path_copy(valid_path,
+ cur->full_path);
if (ret < 0)
goto out;
} else {
@@ -4508,8 +4602,21 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
if (ret < 0)
goto out;
- TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
- clone_root->root->root_item.uuid);
+ /*
+ * If the parent we're using has a received_uuid set then use that as
+ * our clone source as that is what we will look for when doing a
+ * receive.
+ *
+ * This covers the case that we create a snapshot off of a received
+ * subvolume and then use that as the parent and try to receive on a
+ * different host.
+ */
+ if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid))
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ clone_root->root->root_item.received_uuid);
+ else
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ clone_root->root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
le64_to_cpu(clone_root->root->root_item.ctransid));
TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
@@ -4590,6 +4697,171 @@ tlv_put_failure:
return ret;
}
+static int send_extent_data(struct send_ctx *sctx,
+ const u64 offset,
+ const u64 len)
+{
+ u64 sent = 0;
+
+ if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
+ return send_update_extent(sctx, offset, len);
+
+ while (sent < len) {
+ u64 size = len - sent;
+ int ret;
+
+ if (size > BTRFS_SEND_READ_SIZE)
+ size = BTRFS_SEND_READ_SIZE;
+ ret = send_write(sctx, offset + sent, size);
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ break;
+ sent += ret;
+ }
+ return 0;
+}
+
+static int clone_range(struct send_ctx *sctx,
+ struct clone_root *clone_root,
+ const u64 disk_byte,
+ u64 data_offset,
+ u64 offset,
+ u64 len)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * We can't send a clone operation for the entire range if we find
+ * extent items in the respective range in the source file that
+ * refer to different extents or if we find holes.
+ * So check for that and do a mix of clone and regular write/copy
+ * operations if needed.
+ *
+ * Example:
+ *
+ * mkfs.btrfs -f /dev/sda
+ * mount /dev/sda /mnt
+ * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
+ * cp --reflink=always /mnt/foo /mnt/bar
+ * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
+ * btrfs subvolume snapshot -r /mnt /mnt/snap
+ *
+ * If when we send the snapshot and we are processing file bar (which
+ * has a higher inode number than foo) we blindly send a clone operation
+ * for the [0, 100K[ range from foo to bar, the receiver ends up getting
+ * a file bar that matches the content of file foo - iow, doesn't match
+ * the content from bar in the original filesystem.
+ */
+ key.objectid = clone_root->ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = clone_root->offset;
+ ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == clone_root->ino &&
+ key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ struct btrfs_file_extent_item *ei;
+ u8 type;
+ u64 ext_len;
+ u64 clone_len;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(clone_root->root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ /*
+ * We might have an implicit trailing hole (NO_HOLES feature
+ * enabled). We deal with it after leaving this loop.
+ */
+ if (key.objectid != clone_root->ino ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(leaf, ei);
+ if (type == BTRFS_FILE_EXTENT_INLINE) {
+ ext_len = btrfs_file_extent_inline_len(leaf, slot, ei);
+ ext_len = PAGE_CACHE_ALIGN(ext_len);
+ } else {
+ ext_len = btrfs_file_extent_num_bytes(leaf, ei);
+ }
+
+ if (key.offset + ext_len <= clone_root->offset)
+ goto next;
+
+ if (key.offset > clone_root->offset) {
+ /* Implicit hole, NO_HOLES feature enabled. */
+ u64 hole_len = key.offset - clone_root->offset;
+
+ if (hole_len > len)
+ hole_len = len;
+ ret = send_extent_data(sctx, offset, hole_len);
+ if (ret < 0)
+ goto out;
+
+ len -= hole_len;
+ if (len == 0)
+ break;
+ offset += hole_len;
+ clone_root->offset += hole_len;
+ data_offset += hole_len;
+ }
+
+ if (key.offset >= clone_root->offset + len)
+ break;
+
+ clone_len = min_t(u64, ext_len, len);
+
+ if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
+ btrfs_file_extent_offset(leaf, ei) == data_offset)
+ ret = send_clone(sctx, offset, clone_len, clone_root);
+ else
+ ret = send_extent_data(sctx, offset, clone_len);
+
+ if (ret < 0)
+ goto out;
+
+ len -= clone_len;
+ if (len == 0)
+ break;
+ offset += clone_len;
+ clone_root->offset += clone_len;
+ data_offset += clone_len;
+next:
+ path->slots[0]++;
+ }
+
+ if (len > 0)
+ ret = send_extent_data(sctx, offset, len);
+ else
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
static int send_write_or_clone(struct send_ctx *sctx,
struct btrfs_path *path,
struct btrfs_key *key,
@@ -4598,9 +4870,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
int ret = 0;
struct btrfs_file_extent_item *ei;
u64 offset = key->offset;
- u64 pos = 0;
u64 len;
- u32 l;
u8 type;
u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
@@ -4628,22 +4898,15 @@ static int send_write_or_clone(struct send_ctx *sctx,
}
if (clone_root && IS_ALIGNED(offset + len, bs)) {
- ret = send_clone(sctx, offset, len, clone_root);
- } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
- ret = send_update_extent(sctx, offset, len);
+ u64 disk_byte;
+ u64 data_offset;
+
+ disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
+ data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
+ ret = clone_range(sctx, clone_root, disk_byte, data_offset,
+ offset, len);
} else {
- while (pos < len) {
- l = len - pos;
- if (l > BTRFS_SEND_READ_SIZE)
- l = BTRFS_SEND_READ_SIZE;
- ret = send_write(sctx, pos + offset, l);
- if (ret < 0)
- goto out;
- if (!ret)
- break;
- pos += ret;
- }
- ret = 0;
+ ret = send_extent_data(sctx, offset, len);
}
out:
return ret;